Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/X86/avx512-arith.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/avx512-bugfix-25270.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-bugfix-26264.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/avx512-ext.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/avx512-insert-extract.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics.ll | 146
-rw-r--r--  llvm/test/CodeGen/X86/avx512-logic.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mov.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/avx512-select.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vbroadcast.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 84
-rw-r--r--  llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll | 57
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns_wide.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/masked_gather_scatter.ll | 88
-rw-r--r--  llvm/test/CodeGen/X86/masked_memop.ll | 44
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/nontemporal-2.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/nontemporal-loads.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/pmul.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/scalar-int-to-fp.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-half-conversions.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll | 52
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-math.ll | 36
32 files changed, 381 insertions, 398 deletions
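
The pattern across these hunks is uniform: 512-bit register-to-register and memory moves are now expected in the domain that matches the data type — vmovdqa64/vmovdqu64 for integer vectors, vmovapd/vmovupd for doubles — where a blanket vmovaps/vmovups was checked before, and some unmasked 512-bit integer logic ops relax to their FP-domain forms (vandps, vorps, vxorps). As a minimal sketch of the updated expectation, here is a hypothetical reduced test (function name and RUN line are illustrative, not part of this diff); returning the second argument forces a zmm1-to-zmm0 copy, which now selects an integer-domain move, mirroring e.g. test11 in avx512-insert-extract.ll:

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; Integer zmm data now gets an integer-domain move (vmovdqa64) where the
; old check lines expected a blanket vmovaps.
define <8 x i64> @zmm_copy_example(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: zmm_copy_example:
; CHECK: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
  ret <8 x i64> %b
}
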
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 62dece137cc..5e71fffff02 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -891,7 +891,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
double* %j, <8 x i64> %mask1) nounwind {
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -962,30 +962,10 @@ define <8 x float> @test_fxor_8f32(<8 x float> %a) {
}
define <8 x double> @fabs_v8f64(<8 x double> %p)
-; AVX512F-LABEL: fabs_v8f64:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fabs_v8f64:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: fabs_v8f64:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: fabs_v8f64:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: retq
-;
-; SKX-LABEL: fabs_v8f64:
-; SKX: ## BB#0:
-; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: fabs_v8f64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
{
%t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
ret <8 x double> %t
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
index d024475274b..1cf1c076796 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll
@@ -10,7 +10,7 @@ define void @bar__512(<16 x i32>* %var) #0 {
; CHECK-NEXT: subq $112, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: vmovdqu32 (%rbx), %zmm0
-; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill
; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
; CHECK-NEXT: vmovdqa32 %zmm1, (%rbx)
; CHECK-NEXT: callq _Print__512
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index b3e1b17076b..b15d28a649b 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -13,10 +13,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
-; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
-; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: vmovapd %zmm1, %zmm0
+; AVX512BW-NEXT: vmovapd %zmm2, %zmm1
+; AVX512BW-NEXT: vmovapd %zmm3, %zmm2
+; AVX512BW-NEXT: vmovapd %zmm4, %zmm3
; AVX512BW-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
@@ -34,10 +34,10 @@ define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64
; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
-; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
-; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
; AVX512BW-NEXT: retq
%res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
ret <32 x i64> %res
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index faac7b20fd6..e39f303fef9 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -163,7 +163,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
-; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8mem_to_32x16:
@@ -192,7 +192,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
-; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8mem_to_32x16:
@@ -213,7 +213,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16:
@@ -258,7 +258,7 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-NEXT: vpmovsxbw %xmm0, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
-; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16:
diff --git a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll
index d8026cd987c..f52430d2f22 100644
--- a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -129,7 +129,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -146,7 +146,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -163,7 +163,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -229,7 +229,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -358,7 +358,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -485,7 +485,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -502,7 +502,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -519,7 +519,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -679,7 +679,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -696,7 +696,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
@@ -743,7 +743,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 2c42aca33e4..21de1aca43e 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -204,7 +204,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB10_2
; KNL-NEXT: ## BB#1: ## %A
-; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: LBB10_2: ## %B
; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
@@ -219,7 +219,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; SKX-NEXT: testb %al, %al
; SKX-NEXT: je LBB10_2
; SKX-NEXT: ## BB#1: ## %A
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: LBB10_2: ## %B
; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 7d0535546df..22a07c07341 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -745,7 +745,7 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
ret <16 x i32> %res
@@ -777,7 +777,7 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
ret <8 x i64> %res
@@ -809,7 +809,7 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
ret <16 x i32> %res
@@ -841,7 +841,7 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
ret <8 x i64> %res
@@ -873,7 +873,7 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
ret <16 x i32> %res
@@ -905,7 +905,7 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
ret <8 x i64> %res
@@ -928,7 +928,7 @@ declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
ret void
@@ -939,7 +939,7 @@ declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovntpd %zmm0, (%rdi)
+; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
ret void
@@ -970,7 +970,7 @@ define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -992,7 +992,7 @@ define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %p
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -1014,7 +1014,7 @@ define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -1025,7 +1025,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16
define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
ret < 8 x i64> %res
@@ -1036,7 +1036,7 @@ define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %pass
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -1047,7 +1047,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i6
define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
ret < 8 x i64> %res
@@ -1058,7 +1058,7 @@ define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passT
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -1069,7 +1069,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64
define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
ret < 8 x i64> %res
@@ -1080,7 +1080,7 @@ define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %pass
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 65ed7737438..cab1aae1142 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -679,7 +679,7 @@ define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
@@ -713,7 +713,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret <16 x i32> %res
@@ -724,7 +724,7 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
@@ -861,7 +861,7 @@ define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
ret <8 x i64> %res
@@ -1342,7 +1342,7 @@ define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1374,7 +1374,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1406,7 +1406,7 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1438,7 +1438,7 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1470,7 +1470,7 @@ define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1502,7 +1502,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1534,7 +1534,7 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1566,7 +1566,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1599,7 +1599,7 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1631,7 +1631,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1663,7 +1663,7 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
ret <16 x i32> %res
@@ -1695,7 +1695,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
ret <8 x i64> %res
@@ -1960,7 +1960,7 @@ define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -1991,7 +1991,7 @@ define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
@@ -2026,7 +2026,7 @@ define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
@@ -2064,7 +2064,7 @@ define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -2095,7 +2095,7 @@ define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
@@ -2130,7 +2130,7 @@ define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
@@ -2168,7 +2168,7 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -2199,7 +2199,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
@@ -2234,7 +2234,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
@@ -2272,7 +2272,7 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -2303,7 +2303,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
@@ -2338,7 +2338,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
@@ -2376,7 +2376,7 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -2407,7 +2407,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
@@ -2443,7 +2443,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
@@ -2483,7 +2483,7 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
ret < 8 x i64> %res
@@ -2514,7 +2514,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
@@ -2550,7 +2550,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
@@ -2590,7 +2590,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
ret < 16 x i32> %res
@@ -2621,7 +2621,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
@@ -2656,7 +2656,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
@@ -3285,7 +3285,7 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
ret <2 x double> %res
@@ -3297,7 +3297,7 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
ret <2 x double> %res
@@ -3309,7 +3309,7 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
ret <2 x double> %res
@@ -3321,7 +3321,7 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
ret <2 x double> %res
@@ -3333,7 +3333,7 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
ret <2 x double> %res
@@ -3432,7 +3432,7 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
ret <2 x double> %res
@@ -3464,7 +3464,7 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x d
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
ret <2 x double> %res
@@ -3788,7 +3788,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm3, %zmm0
@@ -3806,7 +3806,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm3, %zmm0
@@ -3823,7 +3823,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm3, %zmm0
@@ -3840,7 +3840,7 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm3, %zmm0
@@ -3857,7 +3857,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm2
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2
; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm0
@@ -3875,7 +3875,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm2
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm0
@@ -3895,7 +3895,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm3, %zmm0
@@ -3913,7 +3913,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm3, %zmm0
@@ -3930,7 +3930,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm3, %zmm0
@@ -4956,7 +4956,7 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm3
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
@@ -5197,7 +5197,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
@@ -5214,7 +5214,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
@@ -5231,7 +5231,7 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
@@ -5248,7 +5248,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
@@ -5995,7 +5995,7 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
@@ -6017,10 +6017,10 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
@@ -6041,10 +6041,10 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5
; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
@@ -6065,9 +6065,9 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x f
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
@@ -6088,10 +6088,10 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
@@ -6111,9 +6111,9 @@ define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
@@ -6135,9 +6135,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
@@ -6159,10 +6159,10 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index d085467868a..c8d3b519425 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -157,7 +157,7 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
;
; SKX-LABEL: and_v64i8:
; SKX: ## BB#0:
-; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = and <64 x i8> %a, %b
ret <64 x i8> %res
@@ -172,7 +172,7 @@ define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
;
; SKX-LABEL: andn_v64i8:
; SKX: ## BB#0:
-; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -191,7 +191,7 @@ define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
;
; SKX-LABEL: or_v64i8:
; SKX: ## BB#0:
-; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = or <64 x i8> %a, %b
ret <64 x i8> %res
@@ -206,7 +206,7 @@ define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
;
; SKX-LABEL: xor_v64i8:
; SKX: ## BB#0:
-; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = xor <64 x i8> %a, %b
ret <64 x i8> %res
@@ -221,7 +221,7 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
;
; SKX-LABEL: and_v32i16:
; SKX: ## BB#0:
-; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = and <32 x i16> %a, %b
ret <32 x i16> %res
@@ -236,7 +236,7 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
;
; SKX-LABEL: andn_v32i16:
; SKX: ## BB#0:
-; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -253,7 +253,7 @@ define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
;
; SKX-LABEL: or_v32i16:
; SKX: ## BB#0:
-; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = or <32 x i16> %a, %b
ret <32 x i16> %res
@@ -268,7 +268,7 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
;
; SKX-LABEL: xor_v32i16:
; SKX: ## BB#0:
-; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = xor <32 x i16> %a, %b
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll
index 6b07e9e704d..7c5c028f060 100644
--- a/llvm/test/CodeGen/X86/avx512-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512-mov.ll
@@ -231,7 +231,7 @@ define <8 x i64> @test23(i8 * %addr) {
define void @test24(i8 * %addr, <8 x double> %data) {
; CHECK-LABEL: test24:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovapd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x07]
+; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 64
@@ -241,7 +241,7 @@ define void @test24(i8 * %addr, <8 x double> %data) {
define <8 x double> @test25(i8 * %addr) {
; CHECK-LABEL: test25:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 64
@@ -271,7 +271,7 @@ define <16 x float> @test27(i8 * %addr) {
define void @test28(i8 * %addr, <8 x double> %data) {
; CHECK-LABEL: test28:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 1
@@ -281,7 +281,7 @@ define void @test28(i8 * %addr, <8 x double> %data) {
define <8 x double> @test29(i8 * %addr) {
; CHECK-LABEL: test29:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 1
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 2ac91cc7482..fab6b3dda24 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -27,7 +27,7 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
; CHECK-NEXT: ## BB#1:
; CHECK-NEXT: vmovaps %zmm0, %zmm1
; CHECK-NEXT: LBB1_2:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 005dc23ccf7..d7660e0b4ea 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -359,7 +359,7 @@ define <64 x i8> @_invec32xi8(<32 x i8>%a) {
; AVX512F-LABEL: _invec32xi8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vmovaps %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec32xi8:
@@ -374,7 +374,7 @@ define <32 x i16> @_invec16xi16(<16 x i16>%a) {
; AVX512F-LABEL: _invec16xi16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vmovaps %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec16xi16:
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index b131befcf0a..1fc7a129054 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -669,14 +669,14 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -719,7 +719,7 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
@@ -727,7 +727,7 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
@@ -775,7 +775,7 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
@@ -783,7 +783,7 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
@@ -833,7 +833,7 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
@@ -842,7 +842,7 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
ret <64 x i8> %res
@@ -887,7 +887,7 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
@@ -897,7 +897,7 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
@@ -946,14 +946,14 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b,
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -996,7 +996,7 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
@@ -1004,7 +1004,7 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
@@ -1052,7 +1052,7 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
@@ -1060,7 +1060,7 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
@@ -1110,7 +1110,7 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
@@ -1119,7 +1119,7 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
ret <64 x i8> %res
@@ -1164,7 +1164,7 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
@@ -1174,7 +1174,7 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
@@ -1222,14 +1222,14 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -1272,7 +1272,7 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
@@ -1280,7 +1280,7 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -1326,14 +1326,14 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -1376,7 +1376,7 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
@@ -1384,7 +1384,7 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -1430,14 +1430,14 @@ define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -1480,7 +1480,7 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
@@ -1488,7 +1488,7 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -1534,14 +1534,14 @@ define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -1584,7 +1584,7 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
@@ -1592,7 +1592,7 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -1825,7 +1825,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
@@ -1834,7 +1834,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
@@ -1851,7 +1851,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3
; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
@@ -1860,7 +1860,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3
; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
@@ -1877,7 +1877,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
@@ -1886,7 +1886,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index cf1b15e804f..f69bdc83346 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -2269,7 +2269,7 @@ define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
-; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -2300,7 +2300,7 @@ define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -2334,7 +2334,7 @@ define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
-; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -2365,7 +2365,7 @@ define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
@@ -2399,7 +2399,7 @@ define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
-; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
@@ -2430,7 +2430,7 @@ define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index f201082fb1a..0680290b040 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -15,7 +15,7 @@ define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
-; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret <8 x i64> %res
@@ -46,7 +46,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
@@ -81,7 +81,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
index ce999855d1f..afdd8c34ad4 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -24,11 +24,14 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
-; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm0
-; CHECK: vpaddb %zmm3, %zmm2, %zmm1
-; CHECK: vpaddb %zmm0, %zmm1, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddb %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
%res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
@@ -42,15 +45,15 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
-; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
%res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
@@ -64,15 +67,15 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
-; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
%res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
@@ -86,10 +89,10 @@ declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
ret <64 x i8> %res
}
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index f9126b4614e..9126e659774 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -2330,7 +2330,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i
define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_mask_andnot_epi64_rr_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
@@ -2360,7 +2360,7 @@ define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8
define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rm_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
@@ -2434,7 +2434,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i
define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_mask_andnot_epi64_rr_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
ret <4 x i64> %res
@@ -2464,7 +2464,7 @@ define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8
define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rm_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 41376cf602c..88747c935be 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -857,7 +857,7 @@ declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4
define void @compr7(i8* %addr, <8 x double> %data) {
; CHECK-LABEL: compr7:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret void
@@ -973,7 +973,7 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
; CHECK-LABEL: expand7:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret <8 x double> %res
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index 7b6509ad51c..bf9291c7119 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -244,7 +244,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <
; AVX512: # BB#0:
; AVX512-NEXT: vmovapd (%rdi), %zmm2
; AVX512-NEXT: vfmsub213pd %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovapd %zmm2, %zmm0
; AVX512-NEXT: retq
%x = load <8 x double>, <8 x double>* %a0
%y = fmul <8 x double> %x, %a1
@@ -573,7 +573,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
;
; AVX512-LABEL: test_v8f64_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovapd %zmm2, %zmm3
; AVX512-NEXT: vfnmadd213pd %zmm1, %zmm1, %zmm3
; AVX512-NEXT: vfmadd213pd %zmm3, %zmm2, %zmm0
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 3b748eeb2e5..c2407bd4248 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -103,7 +103,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
-; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test3:
@@ -111,14 +111,14 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test3:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -138,7 +138,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
-; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT: retq
@@ -149,7 +149,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -159,7 +159,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
-; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT: retq
@@ -246,7 +246,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test6:
@@ -256,7 +256,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
@@ -282,7 +282,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
-; KNL_64-NEXT: vmovaps %zmm1, %zmm2
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT: retq
@@ -295,7 +295,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT: retl
@@ -344,7 +344,7 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm2
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
@@ -369,7 +369,7 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm2
+; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -733,7 +733,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovapd %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test16:
@@ -748,7 +748,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test16:
@@ -783,7 +783,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovapd %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
@@ -794,7 +794,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
@@ -1080,7 +1080,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
@@ -1091,7 +1091,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
@@ -1122,7 +1122,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
-; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
@@ -1133,7 +1133,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
@@ -1165,7 +1165,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
@@ -1176,7 +1176,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
@@ -1208,7 +1208,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
-; KNL_64-NEXT: vmovaps %zmm1, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
@@ -1219,7 +1219,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
@@ -1546,15 +1546,15 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT: kshiftrw $8, %k1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
-; KNL_64-NEXT: vmovaps %zmm3, %zmm1
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test31:
; KNL_32: # BB#0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
-; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test31:
@@ -1564,15 +1564,15 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
-; SKX-NEXT: vmovaps %zmm3, %zmm1
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
; SKX_32: # BB#0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT: retl
%res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
@@ -1598,7 +1598,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i32:
@@ -1619,7 +1619,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: retl
%res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
ret <16 x i32> %res
@@ -1633,8 +1633,8 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
-; KNL_64-NEXT: vmovaps %zmm3, %zmm0
-; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i64:
@@ -1657,7 +1657,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
@@ -1670,8 +1670,8 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
-; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i64:
@@ -1694,7 +1694,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: retl
@@ -1756,8 +1756,8 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
-; KNL_64-NEXT: vmovaps %zmm3, %zmm0
-; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: vmovapd %zmm3, %zmm0
+; KNL_64-NEXT: vmovapd %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f64:
@@ -1780,7 +1780,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
@@ -1793,8 +1793,8 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
-; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: vmovapd %zmm3, %zmm0
+; SKX-NEXT: vmovapd %zmm4, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f64:
@@ -1817,7 +1817,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index 8b5146e5e93..1b9f80faeae 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -200,7 +200,7 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double>
; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test5:
@@ -208,7 +208,7 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double>
; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm1, %zmm0
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
@@ -501,7 +501,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
@@ -1314,7 +1314,7 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX512-LABEL: one_mask_bit_set5:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
@@ -1877,8 +1877,8 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
-; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16i64:
@@ -1888,8 +1888,8 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
; SKX-NEXT: retq
%res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
@@ -1981,8 +1981,8 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
-; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: vmovapd %zmm1, %zmm0
+; AVX512F-NEXT: vmovapd %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16f64:
@@ -1992,8 +1992,8 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm1
; SKX-NEXT: retq
%res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
@@ -2204,10 +2204,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
; AVX512F-NEXT: kshiftrw $8, %k2, %k1
; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
-; AVX512F-NEXT: vmovaps %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm3, %zmm2
-; AVX512F-NEXT: vmovaps %zmm4, %zmm3
+; AVX512F-NEXT: vmovapd %zmm1, %zmm0
+; AVX512F-NEXT: vmovapd %zmm2, %zmm1
+; AVX512F-NEXT: vmovapd %zmm3, %zmm2
+; AVX512F-NEXT: vmovapd %zmm4, %zmm3
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_32f64:
@@ -2221,10 +2221,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT: kshiftrw $8, %k2, %k1
; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
-; SKX-NEXT: vmovaps %zmm3, %zmm2
-; SKX-NEXT: vmovaps %zmm4, %zmm3
+; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm1
+; SKX-NEXT: vmovapd %zmm3, %zmm2
+; SKX-NEXT: vmovapd %zmm4, %zmm3
; SKX-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
@@ -5538,7 +5538,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
%res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
ret <64 x i8> %res
@@ -6912,7 +6912,7 @@ define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
%res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index bb9a342ae9a..bc06d8f1904 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -83,13 +83,13 @@ define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noi
define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9:
; ALL: # BB#0:
-; ALL-NEXT: vmovupd 16(%rdi), %zmm0
+; ALL-NEXT: vmovups 16(%rdi), %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 16(%eax), %zmm0
+; X32-AVX512F-NEXT: vmovups 16(%eax), %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 2
%ptr1 = getelementptr inbounds double, double* %ptr, i64 3
@@ -138,7 +138,7 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL: # BB#0:
-; ALL-NEXT: vmovupd 8(%rdi), %zmm0
+; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
@@ -147,7 +147,7 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noin
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
@@ -334,7 +334,7 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwta
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL: # BB#0:
-; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
@@ -343,7 +343,7 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwta
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index e221f8e9520..b97d38e18f4 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -117,7 +117,7 @@ define void @test_zero_v4f32(<4 x float>* %dst) {
; VLX-LABEL: test_zero_v4f32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
ret void
@@ -139,7 +139,7 @@ define void @test_zero_v4i32(<4 x i32>* %dst) {
; VLX-LABEL: test_zero_v4i32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
@@ -162,7 +162,7 @@ define void @test_zero_v2f64(<2 x double>* %dst) {
; VLX-LABEL: test_zero_v2f64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
ret void
@@ -184,7 +184,7 @@ define void @test_zero_v2i64(<2 x i64>* %dst) {
; VLX-LABEL: test_zero_v2i64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
@@ -206,7 +206,7 @@ define void @test_zero_v8i16(<8 x i16>* %dst) {
; VLX-LABEL: test_zero_v8i16:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
@@ -228,7 +228,7 @@ define void @test_zero_v16i8(<16 x i8>* %dst) {
; VLX-LABEL: test_zero_v16i8:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
@@ -657,7 +657,7 @@ define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
;
; VLX-LABEL: test_arg_v4i32:
; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
@@ -676,7 +676,7 @@ define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
;
; VLX-LABEL: test_arg_v2f64:
; VLX: # BB#0:
-; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
ret void
@@ -695,7 +695,7 @@ define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
;
; VLX-LABEL: test_arg_v2i64:
; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
@@ -714,7 +714,7 @@ define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
;
; VLX-LABEL: test_arg_v8i16:
; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
@@ -733,7 +733,7 @@ define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
;
; VLX-LABEL: test_arg_v16i8:
; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 83301e60a1c..deb0cb9bbb2 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1536,7 +1536,7 @@ define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
;
; AVX512-LABEL: test_unaligned_v8f64:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovupd (%rdi), %zmm0
+; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
ret <8 x double> %1
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 5f2c88d670a..5ae20640106 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -391,7 +391,7 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3
; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
index 4a16c3198aa..9ea86b08f7a 100644
--- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -75,7 +75,7 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
; CHECK-LABEL: u64_to_f
; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX512_32: fildll
; AVX512_64: vcvtusi2ssq
@@ -111,7 +111,7 @@ define float @s64_to_f(i64 %a) nounwind {
; AVX512_32: vmovd %eax, %xmm0
; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovq %xmm0, {{[0-9]+}}(%esp)
; AVX512_32: fildll {{[0-9]+}}(%esp)
define float @s64_to_f_2(i64 %a) nounwind {
@@ -151,7 +151,7 @@ define double @s64_to_d(i64 %a) nounwind {
; AVX512_32: vmovd %eax, %xmm0
; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovq %xmm0, {{[0-9]+}}(%esp)
; AVX512_32: fildll
define double @s64_to_d_2(i64 %a) nounwind {
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index 5dec0b8fa11..b457969d31e 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -76,7 +76,7 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_andpd
- ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <2 x double> %a0 to <2 x i64>
%3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -89,7 +89,7 @@ define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_andpd_ymm
- ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <4 x double> %a0 to <4 x i64>
%3 = bitcast <4 x double> %a1 to <4 x i64>
@@ -198,7 +198,7 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_orpd
- ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <2 x double> %a0 to <2 x i64>
%3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -211,7 +211,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_orpd_ymm
- ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <4 x double> %a0 to <4 x i64>
%3 = bitcast <4 x double> %a1 to <4 x i64>
@@ -316,7 +316,7 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_xorpd
- ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <2 x double> %a0 to <2 x i64>
%3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -329,7 +329,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_xorpd_ymm
- ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <4 x double> %a0 to <4 x i64>
%3 = bitcast <4 x double> %a1 to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index b091d1bca2e..3261e988ffb 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3001,7 +3001,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp24:
; AVX512-NEXT: .cfi_offset %r15, -16
-; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
@@ -3011,9 +3011,9 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
-; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
@@ -3024,7 +3024,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
; AVX512-NEXT: orl %ebx, %r14d
; AVX512-NEXT: shlq $32, %r14
; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -3862,17 +3862,17 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
; AVX512-NEXT: .Ltmp67:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %rbx
-; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index d7518495134..4d441aa67ce 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -262,7 +262,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
; ALL-LABEL: shuffle_v16f32_extract_256:
; ALL: # BB#0:
-; ALL-NEXT: vmovups (%rsi), %zmm0
+; ALL-NEXT: vmovupd (%rsi), %zmm0
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: retq
%ptr_a = bitcast float* %a to <16 x float>*
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index d39961d9c42..da8ea83ee94 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -268,14 +268,14 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_8823cc67:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -287,14 +287,14 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9832dc76:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -306,14 +306,14 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9810dc54:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -376,14 +376,14 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08991abb:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@@ -412,14 +412,14 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_09ab1def:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@@ -933,14 +933,14 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_c348cda0:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x double> %shuffle
@@ -1191,14 +1191,14 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle
@@ -1244,14 +1244,14 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_8823cc67:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1263,14 +1263,14 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9832dc76:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -1282,14 +1282,14 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9810dc54:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -1352,14 +1352,14 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08991abb:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i64> %shuffle
@@ -1388,14 +1388,14 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_09ab1def:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i64> %shuffle
@@ -1925,14 +1925,14 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
ret <8 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index baf1054170b..3e0c3e4d8f3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -33,7 +33,7 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 %m)
%res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 %m)
@@ -56,7 +56,7 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 %m)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 %m)
@@ -168,10 +168,10 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x f
define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -191,10 +191,10 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; CHECK: # BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -365,7 +365,7 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
%mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
@@ -414,7 +414,7 @@ define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x
; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 %m)
ret <8 x i64> %1
@@ -433,7 +433,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 %m)
ret <8 x double> %1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index a7794afba3d..dde11968030 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -3074,7 +3074,7 @@ define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX512-LABEL: trunc_and_v8i64_8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vandps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, %a1
@@ -3213,8 +3213,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX512-LABEL: trunc_and_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vandps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vandps %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -3445,7 +3445,7 @@ define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_and_const_v16i64_v16i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
@@ -3587,8 +3587,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -3812,7 +3812,7 @@ define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX512-LABEL: trunc_xor_v8i64_8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, %a1
@@ -3951,8 +3951,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX512-LABEL: trunc_xor_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vxorps %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4183,7 +4183,7 @@ define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
@@ -4325,8 +4325,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4550,7 +4550,7 @@ define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX512-LABEL: trunc_or_v8i64_8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vorps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, %a1
@@ -4689,8 +4689,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vorps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vorps %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4921,7 +4921,7 @@ define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
@@ -5063,8 +5063,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0