path: root/llvm/test
author     Craig Topper <craig.topper@intel.com>  2018-07-16 06:56:09 +0000
committer  Craig Topper <craig.topper@intel.com>  2018-07-16 06:56:09 +0000
commit     07a1787501fc7141c8788421c265ae95008f7c13 (patch)
tree       f6487af0805348b319bf1985c5e7ea1b3821433c /llvm/test
parent     92e26613c6af2c9ebc959de7d02c18be10cd941b (diff)
download   bcm5719-llvm-07a1787501fc7141c8788421c265ae95008f7c13.tar.gz
           bcm5719-llvm-07a1787501fc7141c8788421c265ae95008f7c13.zip
[X86] Merge the FR128 and VR128 regclasses since they have identical spill and alignment characteristics.
This unfortunately requires a bunch of bitcasts to be added to SUBREG_TO_REG, COPY_TO_REGCLASS, and instructions in output patterns. Otherwise TableGen seems to default to picking f128, and then we fail when something tries to get the register class for f128, which isn't always valid. The test changes are because we were previously mixing FR128 and VR128: constrainRegClass found FR128 first, and passes like live range shrinking weren't handling that well.

llvm-svn: 337147
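For illustration, here is a minimal TableGen sketch of the kind of cast the message describes. It is not a literal excerpt from this commit; the specific pattern shown is only an example of an output dag whose type becomes ambiguous once f128 joins VR128's type list.

    // Before the merge, VR128's type set did not contain f128, so TableGen
    // could infer the COPY_TO_REGCLASS result type unambiguously.
    def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
              (COPY_TO_REGCLASS FR32:$src, VR128)>;

    // With f128 in VR128's type list, an explicit value-type cast pins the
    // result type; otherwise TableGen may default to f128, and looking up a
    // register class for f128 is not always valid.
    def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128))>;

The same explicit casts go on SUBREG_TO_REG nodes and on instruction results in output patterns wherever the selected value would otherwise be typed f128.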
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 52
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 160
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics.ll | 140
-rw-r--r--  llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 104
-rw-r--r--  llvm/test/CodeGen/X86/buildvec-insertvec.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/domain-reassignment.mir | 8
-rw-r--r--  llvm/test/CodeGen/X86/haddsub-2.ll | 44
-rw-r--r--  llvm/test/CodeGen/X86/half.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/pr29112.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 52
-rw-r--r--  llvm/test/CodeGen/X86/sse1.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 77
-rw-r--r--  llvm/test/CodeGen/X86/var-permute-128.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/vec_int_to_fp.ll | 84
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/vector-sqrt.ll | 12
19 files changed, 425 insertions(+), 442 deletions(-)
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 6277fdd5d09..6e58ffe0962 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1772,11 +1772,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; X86-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_pd:
@@ -1797,19 +1797,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
-; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; X86-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_ps:
@@ -2391,10 +2391,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -2418,16 +2418,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 55049ead617..048209293df 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -8798,12 +8798,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
; X86-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0]
; X86-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2]
; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2
-; X86-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8]
-; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea]
+; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0]
+; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xe2]
; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2]
-; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc]
-; X86-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0]
-; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0]
+; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc0]
+; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
@@ -8815,12 +8815,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
; X64-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0]
; X64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2]
; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2
-; X64-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8]
-; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea]
+; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X64-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0]
+; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xe2]
; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2]
-; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc]
-; X64-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0]
-; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0]
+; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc0]
+; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -8845,12 +8845,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
; X86-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0]
; X86-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2]
; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2
-; X86-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8]
-; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea]
+; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0]
+; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xe2]
; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2]
-; X86-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc]
-; X86-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0]
-; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0]
+; X86-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc0]
+; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
@@ -8862,12 +8862,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
; X64-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0]
; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2]
; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2
-; X64-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8]
-; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea]
+; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X64-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0]
+; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xe2]
; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2]
-; X64-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc]
-; X64-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0]
-; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0]
+; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc0]
+; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -8943,12 +8943,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X86-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1]
; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4
-; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9]
+; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe1]
; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xb9,0xd1]
-; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
@@ -8960,12 +8960,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X64-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1]
; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4
-; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9]
+; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe1]
; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xb9,0xd1]
-; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -8990,12 +8990,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X86-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1]
; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4
-; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9]
+; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe1]
; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1]
-; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
@@ -9007,12 +9007,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X64-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1]
; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4
-; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9]
+; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe1]
; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1]
-; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -9213,12 +9213,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X86-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1]
; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4
-; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9]
+; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe1]
; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbb,0xd1]
-; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
@@ -9230,12 +9230,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X64-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1]
; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4
-; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9]
+; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe1]
; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbb,0xd1]
-; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -9260,12 +9260,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X86-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1]
; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4
-; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9]
+; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe1]
; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbb,0xd1]
-; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
@@ -9277,12 +9277,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X64-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1]
; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4
-; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9]
+; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe1]
; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbb,0xd1]
-; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -9307,12 +9307,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X86-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1]
; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4
-; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9]
+; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe1]
; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1]
-; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
@@ -9324,12 +9324,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
; X64-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1]
; X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4
-; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea]
-; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9]
+; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
+; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe1]
; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1]
-; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4]
-; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca]
-; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd9,0x58,0xc2]
+; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -9354,12 +9354,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X86-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1]
; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4
-; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9]
+; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe1]
; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1]
-; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
@@ -9371,12 +9371,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
; X64-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1]
; X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4
-; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea]
-; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9]
+; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
+; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe1]
; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1]
-; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4]
-; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca]
-; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0 ## encoding: [0xc5,0xd8,0x58,0xc2]
+; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc0]
; X64-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 1b3066086b0..8494bf2924b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -574,11 +574,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
@@ -600,11 +600,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
@@ -2605,8 +2605,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2641,8 +2641,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovsqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2677,8 +2677,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovusqb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2713,8 +2713,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2749,8 +2749,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2785,8 +2785,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2926,8 +2926,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2962,8 +2962,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovsdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2998,8 +2998,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpmovusdb %zmm0, %xmm0
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -3333,10 +3333,10 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float>
; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm4
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -3359,10 +3359,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm3
; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -3505,10 +3505,10 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm4
; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -3529,10 +3529,10 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
@@ -4203,9 +4203,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovaps %xmm0, %xmm5
; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4328,9 +4328,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovapd %xmm0, %xmm5
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4351,12 +4351,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm4
; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
-; CHECK-NEXT: vmovapd %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = extractelement <2 x double> %x0, i64 0
%2 = extractelement <2 x double> %x1, i64 0
@@ -4398,12 +4398,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm4
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
-; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm5, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %x1, i64 0
@@ -4503,12 +4503,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = extractelement <2 x double> %x0, i64 0
%2 = extractelement <2 x double> %x1, i64 0
@@ -4550,12 +4550,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %x1, i64 0
@@ -4723,12 +4723,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
%2 = extractelement <2 x double> %x0, i64 0
@@ -4778,12 +4778,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%2 = extractelement <4 x float> %x0, i64 0
@@ -4833,12 +4833,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
%2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
@@ -4892,12 +4892,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
%2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 4463a00b2b8..d1eb47a9cc7 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -10,8 +10,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01]
-; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01]
; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca]
+; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01]
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -21,8 +21,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01]
-; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01]
; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca]
+; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01]
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
index c9bea5862a0..2f97492fe46 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -468,9 +468,9 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float
; X86-AVX512DQ-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x19,0x51,0xd1,0x04]
; X86-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x18,0x51,0xd9,0x04]
+; X86-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe8,0x58,0xd3]
; X86-AVX512DQ-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x51,0xc1,0x04]
-; X86-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x58,0xcb]
-; X86-AVX512DQ-NEXT: vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-AVX512DQ-NEXT: vaddps %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc2]
; X86-AVX512DQ-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_ss:
@@ -488,9 +488,9 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float
; X64-AVX512DQ-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x19,0x51,0xd1,0x04]
; X64-AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x18,0x51,0xd9,0x04]
+; X64-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe8,0x58,0xd3]
; X64-AVX512DQ-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x51,0xc1,0x04]
-; X64-AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x58,0xcb]
-; X64-AVX512DQ-NEXT: vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
+; X64-AVX512DQ-NEXT: vaddps %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc2]
; X64-AVX512DQ-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_ss:
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index fa3a9951867..f4ec74b66dc 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1620,8 +1620,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
; X86-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
-; X86-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -1630,8 +1630,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
; X64-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
-; X64-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -1673,8 +1673,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
; X86-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
-; X86-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -1683,8 +1683,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
; X64-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
-; X64-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovsqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -1726,8 +1726,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
; X86-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
-; X86-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -1736,8 +1736,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
; X64-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
-; X64-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovusqb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -1779,8 +1779,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
; X86-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
-; X86-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -1790,8 +1790,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
; X64-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
-; X64-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -1836,8 +1836,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
; X86-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
-; X86-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -1847,8 +1847,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
; X64-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
-; X64-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovsqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -1893,8 +1893,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
; X86-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
-; X86-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -1904,8 +1904,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
; X64-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
-; X64-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovusqb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -1951,8 +1951,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
; X86-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
-; X86-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -1961,8 +1961,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
; X64-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
-; X64-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2005,8 +2005,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
; X86-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
-; X86-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2015,8 +2015,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
; X64-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
-; X64-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovsqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2059,8 +2059,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
; X86-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
-; X86-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2069,8 +2069,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
; X64-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
-; X64-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovusqw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2113,8 +2113,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
; X86-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
-; X86-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2124,8 +2124,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
; X64-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
-; X64-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2171,8 +2171,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
; X86-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
-; X86-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2182,8 +2182,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
; X64-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
-; X64-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovsqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2229,8 +2229,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
; X86-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
-; X86-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2240,8 +2240,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
; X64-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
-; X64-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovusqw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2622,8 +2622,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
; X86-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
-; X86-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2632,8 +2632,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
; X64-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
-; X64-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -2675,8 +2675,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
; X86-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
-; X86-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2685,8 +2685,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
; X64-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
-; X64-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovsdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -2728,8 +2728,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
; X86-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
-; X86-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2738,8 +2738,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
; X64-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
-; X64-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovusdb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -2781,8 +2781,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
; X86-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
-; X86-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2792,8 +2792,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
; X64-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
-; X64-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2838,8 +2838,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
; X86-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
-; X86-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2849,8 +2849,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
; X64-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
-; X64-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovsdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2895,8 +2895,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
; X86-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
-; X86-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X86-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2906,8 +2906,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
; X64-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
-; X64-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfc,0xca]
+; X64-NEXT: vpmovusdb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfc,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -2953,8 +2953,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
; X86-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
-; X86-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2963,8 +2963,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
; X64-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
-; X64-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -3007,8 +3007,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
; X86-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
-; X86-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -3017,8 +3017,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
; X64-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
-; X64-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovsdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -3061,8 +3061,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
; X86-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
-; X86-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -3071,8 +3071,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
; X64-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
-; X64-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovusdw %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -3115,8 +3115,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
; X86-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
-; X86-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -3126,8 +3126,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
; X64-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
-; X64-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -3173,8 +3173,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
; X86-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
-; X86-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -3184,8 +3184,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
; X64-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
-; X64-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovsdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -3231,8 +3231,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
; X86-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
-; X86-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X86-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -3242,8 +3242,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
; X64-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
-; X64-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xfd,0xca]
+; X64-NEXT: vpmovusdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
@@ -4296,8 +4296,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
+; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -4306,8 +4306,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
+; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
@@ -4327,8 +4327,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
+; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
; X86-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -4338,8 +4338,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
+; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
; X64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index eff8319bfe8..882814b83b7 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -107,11 +107,11 @@ define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_load:
diff --git a/llvm/test/CodeGen/X86/domain-reassignment.mir b/llvm/test/CodeGen/X86/domain-reassignment.mir
index 4dffbbbd27d..6cf7fa6e3ac 100644
--- a/llvm/test/CodeGen/X86/domain-reassignment.mir
+++ b/llvm/test/CodeGen/X86/domain-reassignment.mir
@@ -76,8 +76,8 @@ registers:
- { id: 17, class: gr32, preferred-register: '' }
- { id: 18, class: vk1wm, preferred-register: '' }
- { id: 19, class: vr128x, preferred-register: '' }
- - { id: 20, class: fr128, preferred-register: '' }
- - { id: 21, class: fr128, preferred-register: '' }
+ - { id: 20, class: vr128, preferred-register: '' }
+ - { id: 21, class: vr128, preferred-register: '' }
- { id: 22, class: fr32x, preferred-register: '' }
liveins:
- { reg: '$edi', virtual-reg: '%3' }
@@ -141,8 +141,8 @@ body: |
; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]]
; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]]
; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]]
- ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF
- ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]]
+ ; CHECK: [[DEF:%[0-9]+]]:vr128 = IMPLICIT_DEF
+ ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:vr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]]
; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]]
; CHECK: VMOVSSZmr [[COPY6]], 1, $noreg, 0, $noreg, killed [[COPY16]] :: (store 4 into %ir.fptr)
; CHECK: RET 0
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index 42c6a2d57df..92a393b93bc 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -644,11 +644,11 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
@@ -697,9 +697,9 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: movd %r13d, %xmm4
; SSE3-NEXT: movd %r15d, %xmm10
; SSE3-NEXT: movd %r11d, %xmm7
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload
+; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movd %ecx, %xmm6
@@ -908,15 +908,15 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm0
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
-; SSE-NEXT: subss %xmm4, %xmm3
-; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: subss %xmm4, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
+; SSE-NEXT: subss %xmm3, %xmm2
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: subss %xmm3, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
@@ -927,14 +927,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
@@ -1289,11 +1289,11 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r14d
; SSE3-NEXT: addl %eax, %r14d
@@ -1338,9 +1338,9 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: movd %r9d, %xmm5
; SSE3-NEXT: movd %r15d, %xmm14
; SSE3-NEXT: movd %r14d, %xmm2
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload
+; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
-; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 3cbfe03c804..2bdd537c651 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -388,10 +388,10 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
@@ -399,8 +399,8 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-LIBCALL-NEXT: addq $48, %rsp
@@ -453,11 +453,11 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-I686-NEXT: addl $56, %esp
; CHECK-I686-NEXT: popl %esi
; CHECK-I686-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index c286c468710..b7c8820709a 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -288,9 +288,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_012u:
@@ -337,9 +337,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_019u:
@@ -1198,11 +1198,11 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 832ebd15b22..fa3c09873a9 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -10,7 +10,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK: # %bb.0:
; CHECK-NEXT: subq $88, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8
; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3]
@@ -19,7 +19,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0],xmm1[1],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
-; CHECK-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
@@ -37,7 +37,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3]
@@ -49,20 +50,19 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vmovaps %xmm15, %xmm1
-; CHECK-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm9
-; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm15, %xmm15, %xmm8
-; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm0
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
-; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
-; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $88, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index de95fae1bfe..90e31eb5fb3 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -2024,16 +2024,16 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xf3,0x0f,0x10,0x5c,0x24,0x04]
-; X86-SSE-NEXT: # xmm3 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
-; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE-NEXT: unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT: movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
-; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
+; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
+; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
+; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
+; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ps:
@@ -2042,16 +2042,16 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
-; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
-; X86-AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-AVX1-NEXT: vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
-; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; X86-AVX1-NEXT: vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
-; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm3[0]
+; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
+; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
+; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
+; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
+; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ps:
@@ -2353,20 +2353,20 @@ define void @test_mm_setcsr(i32 %a0) nounwind {
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X86-SSE-LABEL: test_mm_setr_ps:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x10]
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
+; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x0c]
+; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
+; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xf3,0x0f,0x10,0x5c,0x24,0x08]
-; X86-SSE-NEXT: # xmm3 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: unpcklps %xmm1, %xmm2 # encoding: [0x0f,0x14,0xd1]
-; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE-NEXT: unpcklps %xmm3, %xmm0 # encoding: [0x0f,0x14,0xc3]
-; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X86-SSE-NEXT: movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
-; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
+; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
+; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_setr_ps:
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 2859387de04..5e383a3bb08 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -57,8 +57,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X86-NEXT: jne .LBB1_8
; X86-NEXT: .LBB1_7:
; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: je .LBB1_10
; X86-NEXT: jmp .LBB1_11
; X86-NEXT: .LBB1_1:
@@ -71,8 +71,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X86-NEXT: je .LBB1_7
; X86-NEXT: .LBB1_8: # %entry
; X86-NEXT: xorps %xmm3, %xmm3
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: jne .LBB1_11
; X86-NEXT: .LBB1_10:
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -96,8 +96,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: jne .LBB1_8
; X64-NEXT: .LBB1_7:
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: testl %esi, %esi
; X64-NEXT: je .LBB1_10
; X64-NEXT: jmp .LBB1_11
; X64-NEXT: .LBB1_1:
@@ -110,8 +110,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: je .LBB1_7
; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
-; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: testl %esi, %esi
; X64-NEXT: jne .LBB1_11
; X64-NEXT: .LBB1_10:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index e7d83dbdcdd..9f9fe237ad4 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -430,48 +430,31 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: subss %xmm5, %xmm4
; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm3, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: test16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vaddss %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: test16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT: vsubss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX512-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX512-NEXT: vaddss %xmm2, %xmm5, %xmm2
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512-NEXT: retq
+; AVX-LABEL: test16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, 42.0
@@ -644,34 +627,34 @@ define <8 x double> @test18(<8 x double> %A, <8 x double> %B) {
;
; AVX512-LABEL: test18:
; AVX512: # %bb.0:
-; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm8
+; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512-NEXT: vsubsd %xmm4, %xmm3, %xmm9
+; AVX512-NEXT: vsubsd %xmm4, %xmm3, %xmm5
; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm7, %xmm6, %xmm6
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
; AVX512-NEXT: vaddsd %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm5[0],xmm3[0]
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm4
-; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm7
-; AVX512-NEXT: vsubsd %xmm7, %xmm4, %xmm2
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vsubsd %xmm5, %xmm4, %xmm6
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm5
+; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm7
; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
-; AVX512-NEXT: vaddsd %xmm7, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vaddsd %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm6[0],xmm4[0]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm4[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm3[0]
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm8[0],xmm6[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm7[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = extractelement <8 x double> %A, i32 0
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 661a4785974..39ccc5eec96 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -439,13 +439,13 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: andl $3, %edx
-; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4f32:
@@ -655,15 +655,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
@@ -742,15 +742,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-NEXT: andl $31, %r11d
; SSE3-NEXT: movzbl 448(%rsp,%r11), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl 480(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl 512(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl 544(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -793,15 +793,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
@@ -880,15 +880,15 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-NEXT: andl $31, %r11d
; SSSE3-NEXT: movzbl 448(%rsp,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl 480(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl 512(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl 544(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 38dcce873e3..cc92f2112ac 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4210,7 +4210,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
; AVX1-NEXT: movq %rax, %rcx
@@ -4218,22 +4218,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4
; AVX1-NEXT: .LBB80_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -4263,29 +4263,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB80_15:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
; AVX1-NEXT: # %bb.17:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB80_18:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
; AVX1-NEXT: # %bb.20:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
; AVX1-NEXT: movq %rax, %rcx
@@ -4293,25 +4293,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB80_21:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
; AVX1-NEXT: # %bb.23:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB80_24:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -4337,7 +4337,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
; AVX2-NEXT: # %bb.5:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
; AVX2-NEXT: movq %rax, %rcx
@@ -4345,22 +4345,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4
; AVX2-NEXT: .LBB80_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
; AVX2-NEXT: # %bb.8:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -4390,29 +4390,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB80_15:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_16
; AVX2-NEXT: # %bb.17:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
; AVX2-NEXT: jmp .LBB80_18
; AVX2-NEXT: .LBB80_16:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB80_18:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_19
; AVX2-NEXT: # %bb.20:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX2-NEXT: jmp .LBB80_21
; AVX2-NEXT: .LBB80_19:
; AVX2-NEXT: movq %rax, %rcx
@@ -4420,25 +4420,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: .LBB80_21:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_22
; AVX2-NEXT: # %bb.23:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
; AVX2-NEXT: jmp .LBB80_24
; AVX2-NEXT: .LBB80_22:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB80_24:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index f13c048afab..12437680999 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -77,13 +77,13 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -97,13 +97,13 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: andl $3, %edx
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
index 8eb22d3b152..87ae53947d0 100644
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -33,13 +33,13 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: vsqrtss %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vsqrtss %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; CHECK-NEXT: retq
entry:
%0 = load float, float* %v, align 4