Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cvt.ll     | 200
-rw-r--r--  llvm/test/CodeGen/X86/ftrunc.ll         | 157
-rw-r--r--  llvm/test/CodeGen/X86/vec_int_to_fp.ll  | 513
3 files changed, 372 insertions(+), 498 deletions(-)
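The updated CHECK lines in the hunks below replace long chains of per-element vcvtusi2sdq conversions with a branch-free bit-manipulation sequence (vpandq/vpblendd, vporq, vpsrlq, vsubpd, vaddpd). As a rough illustration only, and not part of the patch itself, the C sketch below models the scalar math behind the constants that appear in the new assembly (0x4330000000000000 = 2^52, 0x4530000000000000 = 2^84, 1.9342813118337666E+25 = 2^84 + 2^52); the function name and test values are made up for this note.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar model (illustration only) of the uitofp lowering the new CHECK
 * lines test: split the u64 into 32-bit halves, place each half in the
 * mantissa of a power-of-two double, then recombine.
 *   0x4330000000000000 = 2^52          (4841369599423283200 in the vectors)
 *   0x4530000000000000 = 2^84          (4985484787499139072)
 *   0x1.00000001p+84   = 2^84 + 2^52   (1.9342813118337666E+25)
 */
static double uitofp_u64(uint64_t x) {
    uint64_t lo_bits = (x & 0xFFFFFFFFull) | 0x4330000000000000ull; /* 2^52 + lo        */
    uint64_t hi_bits = (x >> 32)           | 0x4530000000000000ull; /* 2^84 + hi * 2^32 */
    double lo, hi;
    memcpy(&lo, &lo_bits, sizeof lo);  /* bit-cast, like reusing the vpor results */
    memcpy(&hi, &hi_bits, sizeof hi);
    /* (2^84 + hi*2^32) - (2^84 + 2^52) = hi*2^32 - 2^52; adding (2^52 + lo)
     * then yields hi*2^32 + lo with one rounding step, matching vsubpd+vaddpd. */
    return (hi - 0x1.00000001p+84) + lo;
}

int main(void) {
    uint64_t tests[] = { 0, 1, 0xFFFFFFFFull, 0x8000000000000000ull, UINT64_MAX };
    for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; ++i)
        printf("%llu -> %.17g (native cast: %.17g)\n",
               (unsigned long long)tests[i], uitofp_u64(tests[i]),
               (double)tests[i]);
    return 0;
}
```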
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 4c089ac379c..e99cdaf1ce9 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -484,32 +484,12 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
define <8 x double> @ulto8f64(<8 x i64> %a) {
; NODQ-LABEL: ulto8f64:
; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; NODQ-NEXT: retq
;
; VLDQ-LABEL: ulto8f64:
@@ -524,32 +504,12 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
;
; KNL_WIDEN-LABEL: ulto8f64:
; KNL_WIDEN: # %bb.0:
-; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT: vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT: vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT: vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT: vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; KNL_WIDEN-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
@@ -558,58 +518,22 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
define <16 x double> @ulto16f64(<16 x i64> %a) {
; NODQ-LABEL: ulto16f64:
; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3
+; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0
+; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0
+; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2
+; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2
+; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1
+; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1
+; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1
+; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; NODQ-NEXT: retq
;
; VLDQ-LABEL: ulto16f64:
@@ -626,58 +550,22 @@ define <16 x double> @ulto16f64(<16 x i64> %a) {
;
; KNL_WIDEN-LABEL: ulto16f64:
; KNL_WIDEN: # %bb.0:
-; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT: vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm3
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT: vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT: vextracti128 $1, %ymm1, %xmm3
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT: vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; KNL_WIDEN-NEXT: vporq %zmm4, %zmm3, %zmm3
+; KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; KNL_WIDEN-NEXT: vporq %zmm5, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm0, %zmm0
+; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm1, %zmm2
+; KNL_WIDEN-NEXT: vporq %zmm4, %zmm2, %zmm2
+; KNL_WIDEN-NEXT: vpsrlq $32, %zmm1, %zmm1
+; KNL_WIDEN-NEXT: vporq %zmm5, %zmm1, %zmm1
+; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm1, %zmm1
+; KNL_WIDEN-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; KNL_WIDEN-NEXT: retq
%b = uitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 01112f48bf4..ff40f619853 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -106,39 +106,34 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: movapd %xmm1, %xmm3
-; SSE2-NEXT: subsd %xmm2, %xmm3
-; SSE2-NEXT: cvttsd2si %xmm3, %rax
-; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: cvttsd2si %xmm1, %rdx
-; SSE2-NEXT: ucomisd %xmm2, %xmm1
-; SSE2-NEXT: cmovaeq %rax, %rdx
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: subsd %xmm2, %xmm1
; SSE2-NEXT: cvttsd2si %xmm1, %rax
+; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE2-NEXT: xorq %rcx, %rax
+; SSE2-NEXT: cvttsd2si %xmm0, %rdx
+; SSE2-NEXT: ucomisd %xmm2, %xmm0
+; SSE2-NEXT: cmovaeq %rax, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: subsd %xmm2, %xmm3
+; SSE2-NEXT: cvttsd2si %xmm3, %rax
; SSE2-NEXT: xorq %rcx, %rax
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: ucomisd %xmm2, %xmm0
; SSE2-NEXT: cmovaeq %rax, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm3, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlq $32, %xmm1
+; SSE2-NEXT: por {{.*}}(%rip), %xmm1
+; SSE2-NEXT: subpd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_v2f64:
@@ -158,68 +153,62 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm1, %xmm3
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: movapd %xmm3, %xmm4
-; SSE2-NEXT: subsd %xmm2, %xmm4
-; SSE2-NEXT: cvttsd2si %xmm4, %rcx
-; SSE2-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
-; SSE2-NEXT: xorq %rdx, %rcx
-; SSE2-NEXT: cvttsd2si %xmm3, %rax
-; SSE2-NEXT: ucomisd %xmm2, %xmm3
-; SSE2-NEXT: cmovaeq %rcx, %rax
-; SSE2-NEXT: movapd %xmm1, %xmm3
-; SSE2-NEXT: subsd %xmm2, %xmm3
-; SSE2-NEXT: cvttsd2si %xmm3, %rsi
-; SSE2-NEXT: xorq %rdx, %rsi
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT: subsd %xmm3, %xmm1
; SSE2-NEXT: cvttsd2si %xmm1, %rcx
-; SSE2-NEXT: ucomisd %xmm2, %xmm1
-; SSE2-NEXT: cmovaeq %rsi, %rcx
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: movapd %xmm1, %xmm3
-; SSE2-NEXT: subsd %xmm2, %xmm3
-; SSE2-NEXT: cvttsd2si %xmm3, %rsi
-; SSE2-NEXT: xorq %rdx, %rsi
-; SSE2-NEXT: cvttsd2si %xmm1, %rdi
-; SSE2-NEXT: ucomisd %xmm2, %xmm1
-; SSE2-NEXT: cmovaeq %rsi, %rdi
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: subsd %xmm2, %xmm1
-; SSE2-NEXT: cvttsd2si %xmm1, %rsi
-; SSE2-NEXT: xorq %rdx, %rsi
-; SSE2-NEXT: cvttsd2si %xmm0, %rdx
-; SSE2-NEXT: ucomisd %xmm2, %xmm0
-; SSE2-NEXT: cmovaeq %rsi, %rdx
+; SSE2-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: cvttsd2si %xmm2, %rdx
+; SSE2-NEXT: ucomisd %xmm3, %xmm2
+; SSE2-NEXT: cmovaeq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm3, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movq %rdi, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm4
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm4
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT: movq %rcx, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm1
-; SSE2-NEXT: movq %rax, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE2-NEXT: movapd %xmm2, %xmm4
+; SSE2-NEXT: subsd %xmm3, %xmm4
+; SSE2-NEXT: cvttsd2si %xmm4, %rcx
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: cvttsd2si %xmm2, %rdx
+; SSE2-NEXT: ucomisd %xmm3, %xmm2
+; SSE2-NEXT: cmovaeq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: subsd %xmm3, %xmm2
+; SSE2-NEXT: cvttsd2si %xmm2, %rcx
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: cvttsd2si %xmm0, %rdx
+; SSE2-NEXT: ucomisd %xmm3, %xmm0
+; SSE2-NEXT: cmovaeq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: subsd %xmm3, %xmm4
+; SSE2-NEXT: cvttsd2si %xmm4, %rcx
+; SSE2-NEXT: xorq %rax, %rcx
+; SSE2-NEXT: cvttsd2si %xmm0, %rax
+; SSE2-NEXT: ucomisd %xmm3, %xmm0
+; SSE2-NEXT: cmovaeq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT: subpd %xmm6, %xmm2
+; SSE2-NEXT: addpd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: psrlq $32, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: subpd %xmm6, %xmm1
+; SSE2-NEXT: addpd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_v4f64:
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 2b8ceeba7f3..14cce63ca96 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -497,63 +497,67 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE2-LABEL: uitofp_2i64_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm4, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm4, %xmm3
-; SSE2-NEXT: movapd %xmm3, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE2-NEXT: addpd %xmm3, %xmm0
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: por {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0
+; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_2i64_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT: subpd %xmm3, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT: subpd %xmm3, %xmm2
-; SSE41-NEXT: haddpd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: por {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlq $32, %xmm0
+; SSE41-NEXT: por {{.*}}(%rip), %xmm0
+; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_2i64_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_2i64_to_2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_2i64_to_2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_2f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
@@ -837,104 +841,96 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE2-LABEL: uitofp_4i64_to_4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm5, %xmm2
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: movapd %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT: subpd %xmm6, %xmm0
+; SSE2-NEXT: addpd %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: subpd %xmm6, %xmm1
+; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_4i64_to_4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT: subpd %xmm4, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm3
-; SSE41-NEXT: haddpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm1
-; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm3
-; SSE41-NEXT: haddpd %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT: por %xmm4, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT: subpd %xmm6, %xmm0
+; SSE41-NEXT: addpd %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: por %xmm4, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm1
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: subpd %xmm6, %xmm1
+; SSE41-NEXT: addpd %xmm2, %xmm1
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_4i64_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_4i64_to_4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_4i64_to_4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
@@ -3446,67 +3442,73 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE2-LABEL: uitofp_load_2i64_to_2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm4, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: por {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0
+; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0
; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm4, %xmm3
-; SSE2-NEXT: movapd %xmm3, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_2i64_to_2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT: subpd %xmm3, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT: subpd %xmm3, %xmm2
-; SSE41-NEXT: haddpd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: por {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlq $32, %xmm0
+; SSE41-NEXT: por {{.*}}(%rip), %xmm0
+; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_load_2i64_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovapd (%rdi), %xmm0
-; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_load_2i64_to_2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_2i64_to_2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
@@ -3652,109 +3654,104 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE2-LABEL: uitofp_load_4i64_to_4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT: subpd %xmm5, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm2
-; SSE2-NEXT: movapd %xmm2, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT: subpd %xmm6, %xmm0
+; SSE2-NEXT: addpd %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: subpd %xmm6, %xmm1
; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm5, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_4i64_to_4f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm0
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT: subpd %xmm4, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm3
-; SSE41-NEXT: haddpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm1
-; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT: subpd %xmm4, %xmm3
-; SSE41-NEXT: haddpd %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT: por %xmm4, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT: subpd %xmm6, %xmm0
+; SSE41-NEXT: addpd %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: por %xmm4, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm1
+; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: subpd %xmm6, %xmm1
+; SSE41-NEXT: addpd %xmm2, %xmm1
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_load_4i64_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovapd (%rdi), %ymm0
-; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: