summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll')
-rw-r--r--llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll113
1 files changed, 75 insertions, 38 deletions
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 513d6d56b6c..ca714c4fc35 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
@@ -22,8 +22,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i3
; CHECK-NEXT: vpbroadcastd %edi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
@@ -42,8 +42,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64>
; CHECK-NEXT: vpbroadcastq %rdi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
@@ -106,9 +106,9 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32>
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
@@ -126,9 +126,9 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
@@ -227,8 +227,8 @@ define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -244,6 +244,7 @@ define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovups %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
@@ -258,6 +259,7 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovupd %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
@@ -272,6 +274,7 @@ define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2,
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovaps %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
@@ -286,6 +289,7 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2,
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovapd %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
@@ -300,6 +304,7 @@ define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
@@ -314,6 +319,7 @@ define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i3
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
@@ -328,6 +334,7 @@ define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64>
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
@@ -342,6 +349,7 @@ define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
@@ -543,8 +551,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -560,6 +568,7 @@ define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
@@ -572,6 +581,7 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
@@ -585,6 +595,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
@@ -597,6 +608,7 @@ define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
@@ -610,6 +622,7 @@ define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
@@ -622,6 +635,7 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
@@ -635,6 +649,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
@@ -647,6 +662,7 @@ define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: andb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
@@ -726,9 +742,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -984,6 +1000,7 @@ define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
ret void
@@ -995,6 +1012,7 @@ define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
ret void
@@ -1006,6 +1024,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_ps_512:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
ret void
@@ -1955,8 +1974,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -1975,8 +1994,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -1995,8 +2014,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2015,8 +2034,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2035,8 +2054,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2055,8 +2074,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2075,8 +2094,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2095,8 +2114,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2116,8 +2135,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2137,8 +2156,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64>
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2156,8 +2175,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2176,8 +2195,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2197,8 +2216,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -2217,8 +2236,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -2237,8 +2256,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpslld $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -2257,8 +2276,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -3007,6 +3026,7 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
ret <4 x float> %res
@@ -3031,6 +3051,7 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
@@ -3077,9 +3098,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3117,9 +3138,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
@@ -3164,6 +3185,7 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3208,6 +3230,7 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask)
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3254,6 +3277,7 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3298,6 +3322,7 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3344,6 +3369,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3388,6 +3414,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3434,6 +3461,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3478,6 +3506,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3579,8 +3608,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
@@ -3612,9 +3641,9 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
@@ -3677,6 +3706,7 @@ define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
; CHECK-NEXT: andb %al, %dil
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
%res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
@@ -3693,6 +3723,7 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
; CHECK-NEXT: andl %eax, %edi
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
%res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
@@ -3711,6 +3742,7 @@ define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16
; CHECK-NEXT: andl %eax, %edi
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
@@ -3728,6 +3760,7 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
; CHECK-NEXT: andb %al, %dil
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
@@ -3822,6 +3855,7 @@ define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k1, %k0
; CHECK-NEXT: sete %al
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x i64> %A to <16 x i32>
@@ -3845,6 +3879,7 @@ define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k1, %k0
; CHECK-NEXT: sete %al
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x i64> %A to <16 x i32>
@@ -3865,6 +3900,7 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
ret i16 %res
@@ -3877,6 +3913,7 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
ret i8 %res
@@ -4181,8 +4218,8 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -4221,8 +4258,8 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
OpenPOWER on IntegriCloud