| author | Craig Topper <craig.topper@intel.com> | 2018-02-10 17:58:58 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-02-10 17:58:58 +0000 |
| commit | 28d3a73c810714fb6da680c5ee0d7e51c9e22d7a (patch) | |
| tree | cec1d9539170d0bc447fb6f47536fc6b4f6ce44a /llvm | |
| parent | 0fbdaa6d816723339e32af0ffa2c10b5abb363a0 (diff) | |
| download | bcm5719-llvm-28d3a73c810714fb6da680c5ee0d7e51c9e22d7a.tar.gz bcm5719-llvm-28d3a73c810714fb6da680c5ee0d7e51c9e22d7a.zip | |
[X86] Extend inputs with elements smaller than i32 to sint_to_fp/uint_to_fp before type legalization.
This prevents extends of masks from being introduced during lowering, where it becomes difficult to combine them out.
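At the DAG level the change is just a relaxed guard in the sint_to_fp/uint_to_fp combines. The fragment below is a condensed sketch of the combineSIntToFP path, built from the lines visible in the X86ISelLowering.cpp hunk further down; the final two getNode calls are paraphrased from the surrounding function, which the hunk elides, so treat them as an approximation rather than the exact source.

```cpp
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();

// SINT_TO_FP(vXi1/vXi8/vXi16) -> SINT_TO_FP(SEXT(... to vXi32))
// Previously limited to i8/i16 (and i1 when the input type wasn't legal);
// now any vector input with elements narrower than i32 is extended to
// vXi32 before type legalization.
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
  SDLoc dl(N);
  EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                               InVT.getVectorNumElements());
  SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
  return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
```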
There are a few oddities in here.
We sometimes concatenate two k-registers produced by two compares, sign_extend the combined pair, and then extract the two halves again. This worked better previously because the sign_extend wasn't created until after the sint_to_fp was split, which led to a split sign_extend being created.
We probably also need to custom type legalize (v2i32 (sext v2i1)) via widening.
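A purely hypothetical sketch of what that custom type legalization could look like (not part of this commit; the hook placement, opcode handling, and widening strategy are all assumptions):

```cpp
// Hypothetical sketch only -- not part of this commit.
// In the X86TargetLowering constructor, mark the illegal v2i32 result
// Custom so the type legalizer calls ReplaceNodeResults for it:
//   setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Custom);

// In X86TargetLowering::ReplaceNodeResults(SDNode *N,
//     SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG):
case ISD::SIGN_EXTEND: {
  SDValue In = N->getOperand(0);
  if (N->getValueType(0) == MVT::v2i32 && In.getValueType() == MVT::v2i1) {
    SDLoc dl(N);
    // Widen the v2i1 input with undef lanes and sign extend straight to
    // the widened v4i32 result; the type legalizer then uses the low half.
    SDValue WideIn = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, In,
                                 DAG.getUNDEF(MVT::v2i1));
    Results.push_back(DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, WideIn));
  }
  return;
}
```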
llvm-svn: 324820
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 9 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-cvt.ll | 530 |
| -rwxr-xr-x | llvm/test/CodeGen/X86/avx512-schedule.ll | 142 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-fsignum.ll | 164 |
4 files changed, 247 insertions, 598 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1b316fe746c..ed1c865ad07 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36836,11 +36836,11 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); - EVT InSVT = InVT.getScalarType(); + // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) - if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); @@ -36870,14 +36870,11 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); - EVT InSVT = InVT.getScalarType(); // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) - if (InVT.isVector() && - (InSVT == MVT::i8 || InSVT == MVT::i16 || - (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) { + if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 892754f3b38..bc44b273189 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -1527,45 +1527,48 @@ define <16 x double> @sbto16f64(<16 x double> %a) { ; NOVLDQ-LABEL: sbto16f64: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 ; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: kunpckbw %k0, %k1, %k1 ; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NOVLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: sbto16f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 -; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; VLDQ-NEXT: vpmovm2d %k1, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm1 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; VLDQ-NEXT: kunpckbw %k0, %k1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto16f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 ; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z} -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} +; VLNODQ-NEXT: kunpckbw %k0, %k1, %k1 +; VLNODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLNODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; VLNODQ-NEXT: vcvtdq2pd %ymm1, 
%zmm1 ; VLNODQ-NEXT: retq ; ; DQNOVL-LABEL: sbto16f64: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k0 -; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; DQNOVL-NEXT: vpmovm2d %k1, %zmm0 -; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; DQNOVL-NEXT: kunpckbw %k0, %k1, %k0 ; DQNOVL-NEXT: vpmovm2d %k0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; DQNOVL-NEXT: retq %cmpres = fcmp ogt <16 x double> %a, zeroinitializer @@ -1612,96 +1615,65 @@ define <8 x double> @sbto8f64(<8 x double> %a) { } define <8 x float> @sbto8f32(<8 x float> %a) { -; NOVLDQ-LABEL: sbto8f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; NOVLDQ-NEXT: retq +; NOVL-LABEL: sbto8f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto8f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 +; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto8f32: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto8f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vcmpltps %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2ps %ymm0, %ymm0 -; DQNOVL-NEXT: retq %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 } define <4 x float> @sbto4f32(<4 x float> %a) { -; NOVLDQ-LABEL: sbto4f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq +; NOVL-LABEL: sbto4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto4f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto4f32: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto4f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; 
DQNOVL-NEXT: vcmpltps %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x float> ret <4 x float> %1 } define <4 x double> @sbto4f64(<4 x double> %a) { -; NOVLDQ-LABEL: sbto4f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; NOVLDQ-NEXT: retq +; NOVL-LABEL: sbto4f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto4f64: ; VLDQ: # %bb.0: @@ -1719,36 +1691,25 @@ define <4 x double> @sbto4f64(<4 x double> %a) { ; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 ; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto4f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; DQNOVL-NEXT: retq %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 } define <2 x float> @sbto2f32(<2 x float> %a) { -; NOVLDQ-LABEL: sbto2f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq +; NOVL-LABEL: sbto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vpmovm2q %k0, %xmm0 +; VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; @@ -1757,61 +1718,39 @@ define <2 x float> @sbto2f32(<2 x float> %a) { ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto2f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vcmpltps %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 } define <2 x double> @sbto2f64(<2 x double> %a) { -; NOVLDQ-LABEL: sbto2f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 -; NOVLDQ-NEXT: 
vzeroupper -; NOVLDQ-NEXT: retq +; NOVL-LABEL: sbto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; VLNODQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto2f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 @@ -1976,54 +1915,34 @@ define <16 x float> @ubto16f32(<16 x i32> %a) { } define <16 x double> @ubto16f64(<16 x i32> %a) { -; NOVLDQ-LABEL: ubto16f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: kshiftrw $8, %k1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %ymm1, %ymm1 -; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; NOVLDQ-NEXT: retq +; NODQ-LABEL: ubto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NODQ-NEXT: retq ; ; VLDQ-LABEL: ubto16f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vpmovd2m %zmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: kshiftrw $8, %k0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm1 -; VLDQ-NEXT: vpsrld $31, %ymm1, %ymm1 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: ubto16f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: kshiftrw $8, %k1, %k1 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %ymm1, %ymm1 -; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; VLNODQ-NEXT: retq -; ; DQNOVL-LABEL: ubto16f64: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 ; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vcvtdq2pd %ymm0, 
%zmm0 -; DQNOVL-NEXT: kshiftrw $8, %k0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm1 -; DQNOVL-NEXT: vpsrld $31, %ymm1, %ymm1 +; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; DQNOVL-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer @@ -2032,268 +1951,95 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { } define <8 x float> @ubto8f32(<8 x i32> %a) { -; NOVLDQ-LABEL: ubto8f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto8f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %ymm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto8f32: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; VLNODQ-NEXT: retq +; NOVL-LABEL: ubto8f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq ; -; DQNOVL-LABEL: ubto8f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vcvtdq2ps %ymm0, %ymm0 -; DQNOVL-NEXT: retq +; VL-LABEL: ubto8f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; VL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 } define <8 x double> @ubto8f64(<8 x i32> %a) { -; NOVLDQ-LABEL: ubto8f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto8f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %ymm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto8f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0 -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: ubto8f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; DQNOVL-NEXT: retq +; ALL-LABEL: ubto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 +; ALL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vpsrld $31, %ymm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 } define <4 x float> @ubto4f32(<4 x i32> %a) { -; NOVLDQ-LABEL: ubto4f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto4f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %xmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto4f32: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLNODQ-NEXT: retq +; NOVL-LABEL: ubto4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq ; -; DQNOVL-LABEL: ubto4f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq +; VL-LABEL: ubto4f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 } define <4 x double> @ubto4f64(<4 x i32> %a) { -; NOVLDQ-LABEL: ubto4f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto4f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %xmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto4f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: ubto4f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; DQNOVL-NEXT: retq +; ALL-LABEL: ubto4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 +; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; ALL-NEXT: retq %mask = icmp slt <4 x i32> 
%a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 } define <2 x float> @ubto2f32(<2 x i32> %a) { -; NOVLDQ-LABEL: ubto2f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto2f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto2f32: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: ubto2f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; DQNOVL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq +; ALL-LABEL: ubto2f32: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { -; NOVLDQ-LABEL: ubto2f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: ubto2f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto2f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: ubto2f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; DQNOVL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, 
%xmm0, %xmm0 -; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq +; ALL-LABEL: ubto2f64: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; ALL-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll index 096cbc7d459..a1ac679eb97 100755 --- a/llvm/test/CodeGen/X86/avx512-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-schedule.ll @@ -2427,22 +2427,24 @@ define <16 x double> @sbto16f64(<16 x double> %a) { ; GENERIC-LABEL: sbto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.33] -; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.33] +; GENERIC-NEXT: vcmpltpd %zmm0, %zmm2, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vcmpltpd %zmm1, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: kunpckbw %k0, %k1, %k0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %zmm1 # sched: [1:0.33] +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] -; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.25] -; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.25] +; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k0 # sched: [3:1.00] +; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: kunpckbw %k0, %k1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %zmm1 # sched: [1:0.25] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <16 x double> %a, zeroinitializer @@ -2475,16 +2477,14 @@ define <8 x float> @sbto8f32(<8 x float> %a) { ; GENERIC-LABEL: sbto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] +; GENERIC-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto8f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] +; SKX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <8 x float> %a, zeroinitializer @@ -2496,16 +2496,14 @@ define <4 x float> @sbto4f32(<4 x float> 
%a) { ; GENERIC-LABEL: sbto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto4f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <4 x float> %a, zeroinitializer @@ -2539,7 +2537,8 @@ define <2 x float> @sbto2f32(<2 x float> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2547,7 +2546,8 @@ define <2 x float> @sbto2f32(<2 x float> %a) { ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x float> %a, zeroinitializer @@ -2559,16 +2559,16 @@ define <2 x double> @sbto2f64(<2 x double> %a) { ; GENERIC-LABEL: sbto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] ; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer @@ -2809,24 +2809,20 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; GENERIC-LABEL: ubto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] -; GENERIC-NEXT: kshiftrw $8, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %ymm1, %ymm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpsrld $31, %zmm0, %zmm1 # sched: [3:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: 
[1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50] -; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; SKX-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %ymm1, %ymm1 # sched: [1:0.50] +; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] +; SKX-NEXT: vpsrld $31, %zmm0, %zmm1 # sched: [1:0.50] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer @@ -2837,18 +2833,16 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { define <8 x float> @ubto8f32(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto8f32: ; SKX: # %bb.0: -; SKX-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50] -; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> @@ -2858,16 +2852,16 @@ define <8 x float> @ubto8f32(<8 x i32> %a) { define <8 x double> @ubto8f64(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto8f64: ; SKX: # %bb.0: -; SKX-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:1.00] -; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2879,18 +2873,16 @@ define <8 x double> @ubto8f64(<8 x i32> %a) { define <4 x float> @ubto4f32(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; 
GENERIC-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto4f32: ; SKX: # %bb.0: -; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> @@ -2900,16 +2892,16 @@ define <4 x float> @ubto4f32(<4 x i32> %a) { define <4 x double> @ubto4f64(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto4f64: ; SKX: # %bb.0: -; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2923,20 +2915,18 @@ define <2 x float> @ubto2f32(<2 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> @@ -2948,20 +2938,20 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: 
vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:0.50] +; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] -; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] sched: [1:1.00] +; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> diff --git a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll index 3fdb7c2e586..5e2c4c74bd8 100644 --- a/llvm/test/CodeGen/X86/sse-fsignum.ll +++ b/llvm/test/CodeGen/X86/sse-fsignum.ll @@ -10,44 +10,17 @@ ; define void @signum32a(<4 x float>*) { -; AVX1-LABEL: signum32a: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vcvtdq2ps %xmm2, %xmm2 -; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX1-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: signum32a: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vcvtdq2ps %xmm2, %xmm2 -; AVX2-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: signum32a: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vcvtdq2ps %xmm2, %xmm2 -; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX512F-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovaps %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX-LABEL: signum32a: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vcvtdq2ps %xmm2, %xmm2 +; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: retq entry: %1 = load <4 x float>, <4 x float>* %0 %2 = fcmp olt <4 x float> %1, zeroinitializer @@ -60,48 +33,19 @@ entry: } define void @signum64a(<2 x double>*) { -; AVX1-LABEL: signum64a: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd (%rdi), %xmm0 -; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2 -; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: signum64a: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovapd (%rdi), %xmm0 -; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2 -; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: signum64a: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovapd (%rdi), %xmm0 -; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcmpltpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vcvtdq2pd %xmm2, %xmm2 -; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovapd %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX-LABEL: signum64a: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2 +; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovapd %xmm0, (%rdi) +; AVX-NEXT: retq entry: %1 = load <2 x double>, <2 x double>* %0 %2 = fcmp olt <2 x double> %1, zeroinitializer @@ -118,46 +62,18 @@ entry: ; define void @signum32b(<8 x float>*) { -; AVX1-LABEL: signum32b: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signum32b: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: signum32b: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vmovaps %ymm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX-LABEL: signum32b: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; 
AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %1 = load <8 x float>, <8 x float>* %0 %2 = fcmp olt <8 x float> %1, zeroinitializer @@ -208,11 +124,11 @@ define void @signum64b(<4 x double>*) { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vmovapd (%rdi), %ymm0 ; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcmpltpd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512F-NEXT: vcvtdq2pd %xmm2, %ymm2 -; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: vsubpd %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovapd %ymm0, (%rdi) |

