-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                  |  17
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cvt.ll                      | 232
-rw-r--r--  llvm/test/CodeGen/X86/avx512-ext.ll                      |  21
-rwxr-xr-x  llvm/test/CodeGen/X86/avx512-schedule.ll                 | 122
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vec-cmp.ll                  |  23
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll |  73
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll       |  40
7 files changed, 335 insertions, 193 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8dbff9f0565..74433ce7e23 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16472,10 +16472,17 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();
-  // Extend VT if the scalar type is v8/v16 and BWI is not supported.
+  // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
+  // avoids a constant pool load.
+  if (VT.getVectorElementType() != MVT::i8) {
+    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
+    return DAG.getNode(ISD::SRL, DL, VT, Extend,
+                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
+  }
+
+  // Extend VT if BWI is not supported.
  MVT ExtVT = VT;
-  if (!Subtarget.hasBWI() &&
-      (VT.getVectorElementType().getSizeInBits() <= 16)) {
+  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
@@ -16499,9 +16506,9 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
-  // Truncate if we had to extend i16/i8 above.
+  // Truncate if we had to extend above.
  if (VT != ExtVT) {
-    WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
  }
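The hunk above is the whole optimization: instead of materializing the zero-extended mask through a zero-masked broadcast of the constant 1 (which needs a load from the constant pool), the mask is sign-extended to all-ones/all-zeros lanes and then logically shifted right so each lane ends up holding 0 or 1. A minimal standalone C++ sketch of the per-element identity the new code path relies on (illustrative only; the function name and scalar framing are mine, not the patch's):

#include <cassert>
#include <cstdint>

// Per-element view of the new lowering: zext(i1) == srl(sext(i1), bits - 1).
// sign_extend turns the mask bit into 0xFFFFFFFF or 0x00000000; the logical
// shift right by 31 then yields 1 or 0, with no constant pool load involved.
uint32_t zext_mask_bit(bool mask_bit) {
  int32_t sext = mask_bit ? -1 : 0;  // SIGN_EXTEND: i1 -> i32
  return (uint32_t)sext >> 31;       // SRL by VT.getScalarSizeInBits() - 1
}

int main() {
  assert(zext_mask_bit(true) == 1);
  assert(zext_mask_bit(false) == 0);
  return 0;
}

The vXi8 case keeps the old select-based path, presumably because x86 has no per-byte vector shift; that is also why the post-select truncate can now hardcode MVT::i8. The test updates below show the resulting sequences: vpmovm2d/vpmovm2q/vpmovm2w plus vpsrld/vpsrlq/vpsrlw where the DQ/BWI mask-to-vector instructions exist, and vpternlogd/vpternlogq $255 or vpcmpeqd plus a zero-masked move where they do not.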
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 573270296ed..01af5286cb4 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -1892,14 +1892,16 @@ define <16 x float> @ubto16f32(<16 x i32> %a) {
; NODQ: # %bb.0:
; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0
; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: ubto16f32:
; DQ: # %bb.0:
-; DQ-NEXT: vpmovd2m %zmm0, %k1
-; DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; DQ-NEXT: vpmovd2m %zmm0, %k0
+; DQ-NEXT: vpmovm2d %k0, %zmm0
+; DQ-NEXT: vpsrld $31, %zmm0, %zmm0
; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; DQ-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
@@ -1912,22 +1914,24 @@ define <16 x double> @ubto16f64(<16 x i32> %a) {
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: movl {{.*}}(%rip), %eax
-; NOVLDQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: kshiftrw $8, %k1, %k1
-; NOVLDQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %ymm1, %ymm1
; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto16f64:
; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %zmm0, %k1
-; VLDQ-NEXT: movl {{.*}}(%rip), %eax
-; VLDQ-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
+; VLDQ-NEXT: vpmovd2m %zmm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; VLDQ-NEXT: kshiftrw $8, %k1, %k1
-; VLDQ-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
+; VLDQ-NEXT: kshiftrw $8, %k0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm1
+; VLDQ-NEXT: vpsrld $31, %ymm1, %ymm1
; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLDQ-NEXT: retq
;
@@ -1935,22 +1939,25 @@ define <16 x double> @ubto16f64(<16 x i32> %a) {
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; VLNODQ-NEXT: movl {{.*}}(%rip), %eax
-; VLNODQ-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: kshiftrw $8, %k1, %k1
-; VLNODQ-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %ymm1, %ymm1
; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto16f64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: kshiftrw $8, %k1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: vpsrld $31, %ymm1, %ymm1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; AVX512DQ-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
@@ -1964,14 +1971,16 @@ define <8 x float> @ubto8f32(<8 x i32> %a) {
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto8f32:
; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VLDQ-NEXT: vpmovd2m %ymm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLDQ-NEXT: retq
;
@@ -1979,15 +1988,18 @@ define <8 x float> @ubto8f32(<8 x i32> %a) {
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; VLNODQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto8f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
@@ -2001,14 +2013,16 @@ define <8 x double> @ubto8f64(<8 x i32> %a) {
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto8f64:
; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VLDQ-NEXT: vpmovd2m %ymm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: retq
;
@@ -2016,15 +2030,18 @@ define <8 x double> @ubto8f64(<8 x i32> %a) {
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; VLNODQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto8f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
@@ -2038,15 +2055,17 @@ define <4 x float> @ubto4f32(<4 x i32> %a) {
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto4f32:
; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLDQ-NEXT: vpmovd2m %xmm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
@@ -2054,15 +2073,18 @@ define <4 x float> @ubto4f32(<4 x i32> %a) {
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; VLNODQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto4f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
@@ -2077,14 +2099,16 @@ define <4 x double> @ubto4f64(<4 x i32> %a) {
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto4f64:
; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLDQ-NEXT: vpmovd2m %xmm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLDQ-NEXT: retq
;
@@ -2092,15 +2116,18 @@ define <4 x double> @ubto4f64(<4 x i32> %a) {
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; VLNODQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto4f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512DQ-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
@@ -2109,49 +2136,98 @@ define <4 x double> @ubto4f64(<4 x i32> %a) {
}
define <2 x float> @ubto2f32(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: ubto2f32:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: ubto2f32:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; VL-NEXT: vptestmq %xmm0, %xmm0, %k1
-; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; VL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; VL-NEXT: retq
+; VLDQ-LABEL: ubto2f32:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: ubto2f32:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: ubto2f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
define <2 x double> @ubto2f64(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f64:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: ubto2f64:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: ubto2f64:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; VL-NEXT: vptestmq %xmm0, %xmm0, %k1
-; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; VL-NEXT: retq
+; VLDQ-LABEL: ubto2f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: ubto2f64:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vcvtudq2pd %xmm0, %xmm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: ubto2f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index d7bc88439a9..128855313ab 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1294,13 +1294,15 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
; KNL-LABEL: zext_16i1_to_16xi32:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpsrld $31, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16i1_to_16xi32:
; SKX: # %bb.0:
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: kmovd %edi, %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: vpsrld $31, %zmm0, %zmm0
; SKX-NEXT: retq
%a = bitcast i16 %b to <16 x i1>
%c = zext <16 x i1> %a to <16 x i32>
@@ -1311,13 +1313,15 @@ define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; KNL-LABEL: zext_8i1_to_8xi64:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpsrlq $63, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8i1_to_8xi64:
; SKX: # %bb.0:
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: kmovd %edi, %k0
+; SKX-NEXT: vpmovm2q %k0, %zmm0
+; SKX-NEXT: vpsrlq $63, %zmm0, %zmm0
; SKX-NEXT: retq
%a = bitcast i8 %b to <8 x i1>
%c = zext <8 x i1> %a to <8 x i64>
@@ -1685,8 +1689,9 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
;
; SKX-LABEL: zext_32xi1_to_32xi16:
; SKX: # %bb.0:
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
-; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; SKX-NEXT: vpmovm2w %k0, %zmm0
+; SKX-NEXT: vpsrlw $15, %zmm0, %zmm0
; SKX-NEXT: retq
%mask = icmp eq <32 x i16> %x, %y
%1 = zext <32 x i1> %mask to <32 x i16>
diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll
index 7a2aa56f7cc..096cbc7d459 100755
--- a/llvm/test/CodeGen/X86/avx512-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-schedule.ll
@@ -2787,15 +2787,17 @@ define <16 x float> @usto16f32(<16 x i16> %a) {
define <16 x float> @ubto16f32(<16 x i32> %a) {
; GENERIC-LABEL: ubto16f32:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %zmm0, %zmm0 # sched: [3:1.00]
; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto16f32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %zmm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %zmm0, %zmm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <16 x i32> %a, zeroinitializer
@@ -2806,23 +2808,25 @@ define <16 x float> @ubto16f32(<16 x i32> %a) {
define <16 x double> @ubto16f64(<16 x i32> %a) {
; GENERIC-LABEL: ubto16f64:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50]
-; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
-; GENERIC-NEXT: kshiftrw $8, %k1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrw $8, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %ymm1, %ymm1 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto16f64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %zmm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50]
-; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
-; SKX-NEXT: kshiftrw $8, %k1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %ymm1, %ymm1 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <16 x i32> %a, zeroinitializer
@@ -2833,15 +2837,17 @@ define <8 x float> @ubto8f32(<8 x i32> %a) {
; GENERIC-LABEL: ubto8f32:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto8f32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %ymm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <8 x i32> %a, zeroinitializer
@@ -2852,15 +2858,17 @@ define <8 x double> @ubto8f64(<8 x i32> %a) {
; GENERIC-LABEL: ubto8f64:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto8f64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %ymm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vpmovd2m %ymm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <8 x i32> %a, zeroinitializer
@@ -2871,15 +2879,17 @@ define <8 x double> @ubto8f64(<8 x i32> %a) {
define <4 x float> @ubto4f32(<4 x i32> %a) {
; GENERIC-LABEL: ubto4f32:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto4f32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <4 x i32> %a, zeroinitializer
@@ -2890,15 +2900,17 @@ define <4 x float> @ubto4f32(<4 x i32> %a) {
define <4 x double> @ubto4f64(<4 x i32> %a) {
; GENERIC-LABEL: ubto4f64:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: ubto4f64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp slt <4 x i32> %a, zeroinitializer
@@ -2911,8 +2923,9 @@ define <2 x float> @ubto2f32(<2 x i32> %a) {
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -2920,8 +2933,9 @@ define <2 x float> @ubto2f32(<2 x i32> %a) {
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <2 x i32> %a, zeroinitializer
@@ -2934,8 +2948,9 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -2943,8 +2958,9 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [5:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <2 x i32> %a, zeroinitializer
@@ -4244,14 +4260,16 @@ define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
; GENERIC-LABEL: zext_16i1_to_16xi32:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrld $31, %zmm0, %zmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_16i1_to_16xi32:
; SKX: # %bb.0:
-; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrld $31, %zmm0, %zmm0 # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = bitcast i16 %b to <16 x i1>
%c = zext <16 x i1> %a to <16 x i32>
@@ -4261,14 +4279,16 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; GENERIC-LABEL: zext_8i1_to_8xi64:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
-; GENERIC-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrlq $63, %zmm0, %zmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_8i1_to_8xi64:
; SKX: # %bb.0:
-; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
-; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrlq $63, %zmm0, %zmm0 # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = bitcast i8 %b to <8 x i1>
%c = zext <8 x i1> %a to <8 x i64>
@@ -4653,14 +4673,16 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
; GENERIC-LABEL: zext_32xi1_to_32xi16:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrlw $15, %zmm0, %zmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_32xi1_to_32xi16:
; SKX: # %bb.0:
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrlw $15, %zmm0, %zmm0 # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp eq <32 x i16> %x, %y
%1 = zext <32 x i1> %mask to <32 x i16>
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 0c5669dec76..a141b28a1bf 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -288,11 +288,19 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
}
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
-; CHECK-LABEL: test13:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
+; AVX512-LABEL: test13:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpsrld $31, %zmm0, %zmm0
+; AVX512-NEXT: retq
+;
+; SKX-LABEL: test13:
+; SKX: ## %bb.0:
+; SKX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: vpsrld $31, %zmm0, %zmm0
+; SKX-NEXT: retq
{
%cmpvector_i = fcmp oeq <16 x float> %a, %b
%conv = zext <16 x i1> %cmpvector_i to <16 x i32>
@@ -906,8 +914,9 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
;
; SKX-LABEL: test46:
; SKX: ## %bb.0:
-; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: vpsrlq $63, %xmm0, %xmm0
; SKX-NEXT: retq
%mask = fcmp oeq <2 x float> %x, %y
%1 = zext <2 x i1> %mask to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index c694cf29313..6cd52c4d25c 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -50,15 +50,17 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
; AVX512F-LABEL: ext_i2_2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i2_2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i2 %a0 to <2 x i1>
%2 = zext <2 x i1> %1 to <2 x i64>
@@ -99,15 +101,17 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
; AVX512F-LABEL: ext_i4_4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i4_4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
%2 = zext <4 x i1> %1 to <4 x i32>
@@ -150,16 +154,17 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX512F-LABEL: ext_i8_8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i8_8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: kmovd %edi, %k0
+; AVX512VLBW-NEXT: vpmovm2w %k0, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i16>
@@ -289,14 +294,16 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX512F-LABEL: ext_i4_4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i4_4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
%2 = zext <4 x i1> %1 to <4 x i64>
@@ -350,14 +357,16 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX512F-LABEL: ext_i8_8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i8_8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i32>
@@ -413,14 +422,16 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX512F-LABEL: ext_i16_16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i16_16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: kmovd %edi, %k0
+; AVX512VLBW-NEXT: vpmovm2w %k0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i16>
@@ -611,13 +622,15 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX512F-LABEL: ext_i8_8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i8_8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrlq $63, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i64>
@@ -694,13 +707,15 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX512F-LABEL: ext_i16_16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i16_16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpsrld $31, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i32>
@@ -786,17 +801,19 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i32_32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: kmovd %edi, %k0
+; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $15, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
%1 = bitcast i32 %a0 to <32 x i1>
%2 = zext <32 x i1> %1 to <32 x i16>
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index e78c1d2d787..dc60fd5c672 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -135,8 +135,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
; AVX256: # %bb.0:
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
+; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
@@ -144,8 +146,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -169,15 +173,14 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
-; AVX256-NEXT: movl {{.*}}(%rip), %eax
-; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k2} {z}
+; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256-NEXT: vpsrlw $15, %xmm1, %xmm1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX256-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpbroadcastd %eax, %ymm2 {%k1} {z}
-; AVX256-NEXT: vpmovdw %ymm2, %xmm2
-; AVX256-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
@@ -218,12 +221,13 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
-; AVX256-NEXT: movl {{.*}}(%rip), %eax
-; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
-; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vpbroadcastd %eax, %ymm1 {%k2} {z}
+; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: vpmovdw %ymm0, %xmm0
+; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX256-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_zext_v16i16:
@@ -232,8 +236,9 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512F-LABEL: testv16i1_zext_v16i16:
@@ -243,8 +248,9 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT: retq
%in = load <8 x i32>, <8 x i32>* %p
%cmp = icmp eq <8 x i32> %in, zeroinitializer
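A recurring pattern in the updated tests is worth spelling out: on targets without the DQ/BWI mask-to-vector moves, the sign-extended mask is built with vpternlogd $255 under a zeroing mask. A small C++ sketch of the standard VPTERNLOG per-bit semantics shows why immediate 0xFF yields all-ones lanes (the emulation function and names are mine, written for illustration under that assumed semantics):

#include <cassert>
#include <cstdint>

// VPTERNLOG evaluates, for each bit position, the truth table selected by
// imm8: index = (bit of a) << 2 | (bit of b) << 1 | (bit of c), and the
// result bit is imm8[index]. With imm8 = 0xFF every entry is 1, so the
// result is all-ones regardless of the source operands.
uint32_t ternlog(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
  uint32_t r = 0;
  for (int bit = 0; bit < 32; ++bit) {
    unsigned idx = ((a >> bit) & 1u) << 2 | ((b >> bit) & 1u) << 1 |
                   ((c >> bit) & 1u);
    r |= ((uint32_t)((imm >> idx) & 1u)) << bit;
  }
  return r;
}

int main() {
  assert(ternlog(0x12345678u, 0x0u, 0xDEADBEEFu, 0xFF) == 0xFFFFFFFFu);
  return 0;
}

Combined with the {z} zeroing mask, vpternlogd $255 writes all-ones into the mask-selected lanes and zero elsewhere: exactly the sign-extended mask vector that vpmovm2d would produce, which the following vpsrld/vpsrlq/vpsrlw then turns into 0/1 lanes.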