diff options
author | Craig Topper <craig.topper@intel.com> | 2018-11-19 04:33:20 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-11-19 04:33:20 +0000 |
commit | 3616891046e7f13a758e53dcc6fa73a7c3232b35 (patch) | |
tree | 8126bffd4f9009478ef780060293bc37d774e892 /llvm | |
parent | 053f1eea96eaa8a0a4bb034274fa485655323d32 (diff) | |
download | bcm5719-llvm-3616891046e7f13a758e53dcc6fa73a7c3232b35.tar.gz bcm5719-llvm-3616891046e7f13a758e53dcc6fa73a7c3232b35.zip |
[X86] Use compare with 0 to fill an element with sign bits when sign extending to v2i64 pre-sse4.1
Previously we used an arithmetic shift right by 31, but that requires a copy to preserve the input. Instead, we might as well materialize a zero and compare against it: the signed comparison 0 > x produces all ones exactly when x is negative, which is the same sign fill, and the comparison overwrites the register that contains the zeros rather than the input. This should be one byte shorter.
llvm-svn: 347181
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/pmul.ll | 130 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/trunc-subvector.ll | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vec_fp_to_int.ll | 20 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-sext-widen.ll | 352 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-sext.ll | 480 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc-math-widen.ll | 67 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc-math.ll | 67 |
8 files changed, 573 insertions, 568 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a8859be9fb1..cf7d7a9b0d1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20083,8 +20083,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, return SignExt; if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { - SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, - DAG.getConstant(31, dl, MVT::i8)); + SDValue Zero = DAG.getConstant(0, dl, CurrVT); + SDValue Sign = DAG.getSetCC(dl, CurrVT, Zero, Curr, ISD::SETGT); SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); return DAG.getBitcast(VT, Ext); } @@ -26358,8 +26358,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. - SDValue SignBits = DAG.getNode(ISD::SRA, dl, MVT::v4i32, In, - DAG.getConstant(31, dl, MVT::v4i32)); + SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); + SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to v2i64. 
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 15263250a7d..0ec7eb40ec8 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1302,74 +1302,76 @@ entry: define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm15 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm15, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: psrad $16, %xmm15 -; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm7, %xmm13 -; SSE2-NEXT: psrad $31, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] 
+; SSE2-NEXT: pmuludq %xmm0, %xmm6 +; SSE2-NEXT: paddq %xmm3, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: paddq %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm6 +; SSE2-NEXT: paddq %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: psrad $31, %xmm14 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: paddq %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm7, %xmm6 +; SSE2-NEXT: 
paddq %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pmuludq %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm7, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm12, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm12, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm15, %xmm3 -; SSE2-NEXT: pmuludq %xmm11, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm11, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm3 -; SSE2-NEXT: psllq $32, %xmm3 -; SSE2-NEXT: paddq %xmm15, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE2-NEXT: punpckldq 
{{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pmuludq %xmm4, %xmm6 +; SSE2-NEXT: paddq %xmm5, %xmm6 +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: paddq %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll index 88830ee7288..77e67d6e554 100644 --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -41,7 +41,8 @@ define <2 x i32> @test3(<8 x i32> %v) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; @@ -67,8 +68,8 @@ define <2 x i32> @test3(<8 x i32> %v) { define <2 x i32> @test4(<8 x i32> %v) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; @@ -93,12 +94,12 @@ define <2 x i32> @test4(<8 x i32> %v) { define <2 x i32> @test5(<8 x i32> %v) { ; SSE2-LABEL: test5: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll 
b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index fd8e4c41b25..47a958f6acd 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2387,8 +2387,8 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; @@ -2430,8 +2430,8 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; @@ -2557,9 +2557,9 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f64_to_2i8: @@ -2600,9 +2600,9 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f64_to_2i16: diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll 
index 6d245a223ea..eec82bc9e0f 100644 --- a/llvm/test/CodeGen/X86/vector-sext-widen.ll +++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -446,8 +446,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -456,8 +456,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -476,8 +476,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $24, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -498,8 +498,8 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -510,8 +510,8 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -548,8 +548,8 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -574,18 +574,18 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 -; 
SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: retq ; @@ -594,18 +594,18 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSSE3-NEXT: psrad $24, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = 
xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: retq ; @@ -654,18 +654,18 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 ; X32-SSE2-NEXT: retl ; @@ -894,8 +894,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i16_to_2i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -903,8 +903,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3-LABEL: sext_8i16_to_2i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -922,8 +922,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i16_to_2i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -943,8 +943,8 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -954,8 +954,8 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd 
%xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -991,8 +991,8 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -1016,18 +1016,18 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: 
retq ; @@ -1035,18 +1035,18 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: retq ; @@ -1093,18 +1093,18 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; 
X32-SSE2-NEXT: psrad $16, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 ; X32-SSE2-NEXT: retl ; @@ -1127,15 +1127,15 @@ entry: define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_2i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_2i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -1151,8 +1151,8 @@ define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; X32-SSE2-LABEL: sext_4i32_to_2i64: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl ; @@ -1169,23 +1169,23 @@ entry: define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_4i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_4i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -1217,12 +1217,12 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; X32-SSE2-LABEL: sext_4i32_to_4i64: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ -1242,38 +1242,38 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i32_to_8i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 
-; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_8i32_to_8i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: pcmpgtd %xmm3, 
%xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSSE3-NEXT: retq ; @@ -1319,19 +1319,19 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i32_to_8i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X32-SSE2-NEXT: retl ; @@ -1452,8 +1452,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -1464,8 +1464,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; 
SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -1487,8 +1487,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; X32-SSE2-NEXT: movd %eax, %xmm0 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $24, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -1980,8 +1980,8 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -2049,8 +2049,8 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: 
pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: retl ; @@ -2487,15 +2487,15 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -5238,8 +5238,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -5248,8 +5248,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, 
%xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -5269,8 +5269,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -5336,8 +5336,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5348,8 +5348,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5384,8 +5384,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = 
mem[0],zero ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5471,16 +5471,16 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; SSE2-LABEL: load_sext_2i32_to_2i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_2i32_to_2i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -5498,8 +5498,8 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl ; @@ -5518,24 +5518,24 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; SSE2-LABEL: load_sext_4i32_to_4i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor 
%xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i32_to_4i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa (%rdi), %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -5566,12 +5566,12 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movdqa (%eax), %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ -5638,12 +5638,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2: # %bb.0: ; 
SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; @@ -5651,12 +5651,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSSE3: # %bb.0: ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -5698,12 +5698,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: pslld $31, %xmm0 ; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; 
X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ -5726,8 +5726,8 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5738,8 +5738,8 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5776,8 +5776,8 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] diff --git a/llvm/test/CodeGen/X86/vector-sext.ll 
b/llvm/test/CodeGen/X86/vector-sext.ll index 096952c7c08..d38f5f3f36c 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -446,8 +446,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -456,8 +456,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -476,8 +476,8 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $24, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -497,16 +497,16 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 
= xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: psrad $24, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -514,16 +514,16 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: psrad $24, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; @@ -557,16 +557,16 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; X32-SSE2: # 
%bb.0: # %entry ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X32-SSE2-NEXT: psrad $24, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE2-NEXT: retl ; @@ -586,62 +586,62 @@ entry: define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_16i8_to_8i64: ; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: psrad $24, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_16i8_to_8i64: ; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} 
xmm3 = xmm0[1,1,2,3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: psrad $24, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 ; SSSE3-NEXT: psrad $24, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 ; SSSE3-NEXT: psrad $24, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_16i8_to_8i64: @@ -686,32 +686,32 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; ; X32-SSE2-LABEL: sext_16i8_to_8i64: ; X32-SSE2: # %bb.0: # %entry +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 -; X32-SSE2-NEXT: psrad $24, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; 
X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; X32-SSE2-NEXT: psrad $24, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: sext_16i8_to_8i64: @@ -939,8 +939,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i16_to_2i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -948,8 +948,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3-LABEL: sext_8i16_to_2i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -967,8 +967,8 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i16_to_2i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -987,30 +987,30 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i16_to_4i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_8i16_to_4i64: ; 
SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; @@ -1043,15 +1043,15 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i16_to_4i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE2-NEXT: retl ; @@ -1072,52 +1072,52 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i16_to_8i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_8i16_to_8i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: psrad $31, 
%xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: psrad $16, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: retq ; @@ -1163,26 +1163,26 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i16_to_8i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; X32-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 +; 
X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: psrad $16, %xmm1 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X32-SSE2-NEXT: psrad $16, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 ; X32-SSE2-NEXT: retl ; @@ -1205,15 +1205,15 @@ entry: define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_2i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_2i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -1229,8 +1229,8 @@ define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; X32-SSE2-LABEL: sext_4i32_to_2i64: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl ; @@ -1247,23 +1247,23 @@ entry: define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_4i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; 
SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_4i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -1295,12 +1295,12 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; X32-SSE2-LABEL: sext_4i32_to_4i64: ; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ -1320,38 +1320,38 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: 
sext_8i32_to_8i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_8i32_to_8i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psrad $31, 
%xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSSE3-NEXT: retq ; @@ -1397,19 +1397,19 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE2-LABEL: sext_8i32_to_8i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X32-SSE2-NEXT: pxor %xmm5, %xmm5 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE2-NEXT: psrad $31, %xmm3 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X32-SSE2-NEXT: retl ; @@ -1530,8 +1530,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; 
SSE2-NEXT: retq @@ -1542,8 +1542,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -1565,8 +1565,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { ; X32-SSE2-NEXT: movd %eax, %xmm0 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $24, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -5377,8 +5377,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -5387,8 +5387,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -5408,8 +5408,8 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: psrad $16, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl @@ -5633,16 +5633,16 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; SSE2-LABEL: load_sext_2i32_to_2i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_2i32_to_2i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -5660,8 +5660,8 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE2-NEXT: retl ; @@ -5680,24 +5680,24 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; SSE2-LABEL: load_sext_4i32_to_4i64: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 
%xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i32_to_4i64: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movdqa (%rdi), %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -5728,12 +5728,12 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movdqa (%eax), %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ 
-5808,12 +5808,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2: # %bb.0: ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; @@ -5821,12 +5821,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSSE3: # %bb.0: ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -5868,12 +5868,12 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: pslld $31, %xmm0 ; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; @@ -5895,12 +5895,12 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE2: # %bb.0: ; SSE2-NEXT: pslld $24, %xmm0 ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; @@ -5908,12 +5908,12 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSSE3: # %bb.0: ; SSSE3-NEXT: pslld $24, %xmm0 ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; @@ -5955,12 +5955,12 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: pslld $24, %xmm0 ; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-SSE2-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll index fb5407b165f..cbff28d4b76 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -5569,39 +5569,40 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_self_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrad $31, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: psrad $31, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pmuludq %xmm0, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] -; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: pmuludq %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,1,3] -; SSE-NEXT: 
pmuludq %xmm6, %xmm1 -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: paddd %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm7, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: paddq %xmm3, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_self_v4i64_v4i32: diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index c8ccab02614..e3b3ee4bb22 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ 
b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -5569,39 +5569,40 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_self_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrad $31, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: psrad $31, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pmuludq %xmm0, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] -; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: pmuludq %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,1,3] -; SSE-NEXT: pmuludq %xmm6, %xmm1 -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: paddd %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm7, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: paddq %xmm3, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_self_v4i64_v4i32: |