diff options
author | Craig Topper <craig.topper@intel.com> | 2018-11-16 22:53:00 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-11-16 22:53:00 +0000 |
commit | ee0333b4a9d5baf3165f09cd6fb4b520fae620e6 (patch) | |
tree | 7aec9c72e73a4c0b98a5628bf0e66b17519e281f /llvm/test/CodeGen | |
parent | ac35cd330ac06cce6a158e102fc9265f97476a7b (diff) | |
download | bcm5719-llvm-ee0333b4a9d5baf3165f09cd6fb4b520fae620e6.tar.gz bcm5719-llvm-ee0333b4a9d5baf3165f09cd6fb4b520fae620e6.zip |
[X86] Add custom promotion of narrow fp_to_uint/fp_to_sint operations under -x86-experimental-vector-widening-legalization.
This tries to force the result type to vXi32 followed by a truncate. This can help avoid scalarization that would otherwise occur.
There are some annoying examples of an avx512 truncate instruction followed by a packus where we should really be able to just use one truncate. But overall this is still a net improvement.
llvm-svn: 347105
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-cvt-widen.ll | 42 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vec_cast2.ll | 94 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vec_cast3.ll | 28 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll | 334 |
4 files changed, 97 insertions, 401 deletions
diff --git a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll index a4d9a8272a3..82681ec7d41 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll @@ -502,33 +502,21 @@ define <8 x i16> @f64to8us(<8 x double> %f) { } define <8 x i8> @f64to8uc(<8 x double> %f) { -; ALL-LABEL: f64to8uc: -; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; ALL-NEXT: vcvttsd2si %xmm1, %eax -; ALL-NEXT: vcvttsd2si %xmm0, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; ALL-NEXT: vcvttsd2si %xmm2, %eax -; ALL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; ALL-NEXT: vcvttsd2si %xmm0, %eax -; ALL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvttsd2si %xmm0, %eax -; ALL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; NOVL-LABEL: f64to8uc: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8uc: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll index 1bc4b690487..d746bf59121 100644 --- 
a/llvm/test/CodeGen/X86/vec_cast2.ll +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -172,29 +172,10 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v8f32_v8i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vzeroupper ; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> @@ -229,17 +210,8 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, 
%eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -253,11 +225,8 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4i16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i16> ret <4 x i16> %res @@ -274,29 +243,10 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v8f32_v8u8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: 
vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vzeroupper ; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i8> @@ -331,17 +281,8 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -355,11 +296,8 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v4f32_v4u16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: 
vpackusdw %xmm0, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i16> ret <4 x i16> %res diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll index e8662b8cc34..e4e6aa52ff5 100644 --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -117,11 +117,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -136,11 +133,8 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> %res @@ -170,11 +164,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: vmovd %ecx, %xmm0 -; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-WIDE-NEXT: retl 
%res = fptoui <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -189,11 +180,8 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) { ; ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll index f4e0147e3f1..47078daa925 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll @@ -2310,31 +2310,17 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm1, %eax -; VEX-NEXT: vcvttss2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f32_to_2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: 
fptosi_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -2342,64 +2328,15 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: fptosi_2f32_to_2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_2f32_to_2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2dq 
%ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2407,31 +2344,17 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f32_to_2i8: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm1, %eax -; VEX-NEXT: vcvttss2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_2f32_to_2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i8> ret <2 x i8> %cvt 
} @@ -2439,64 +2362,15 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_2f32_to_2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f32_to_2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16: -; AVX512VLDQ: # 
%bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f32_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2504,22 +2378,16 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm1, %eax -; AVX-NEXT: vcvttsd2si %xmm0, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i8> ret <2 x i8> %cvt @@ -2528,55 +2396,15 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i16: -; VEX: # %bb.0: -; 
VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm1, %eax -; VEX-NEXT: vcvttsd2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2584,22 +2412,16 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i8: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movzbl %al, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: shll $8, %ecx -; SSE-NEXT: orl %eax, %ecx -; SSE-NEXT: 
movd %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fptoui_2f64_to_2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm1, %eax -; AVX-NEXT: vcvttsd2si %xmm0, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i8> ret <2 x i8> %cvt @@ -2608,55 +2430,15 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %eax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %ecx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i16: -; VEX: # %bb.0: -; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm1, %eax -; VEX-NEXT: vcvttsd2si %xmm0, %ecx -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i16: -; AVX512DQ: # %bb.0: -; 
AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } |