diff options
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 113 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 78 |
2 files changed, 127 insertions, 64 deletions
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 513d6d56b6c..ca714c4fc35 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone @@ -22,8 +22,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i3 ; CHECK-NEXT: vpbroadcastd %edi, %zmm1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %edi, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) @@ -42,8 +42,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> ; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) @@ -106,9 +106,9 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) @@ -126,9 +126,9 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) @@ -227,8 +227,8 @@ define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) @@ -244,6 +244,7 @@ define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovups %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask) call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1) @@ -258,6 +259,7 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) { ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovupd %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask) call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1) @@ -272,6 +274,7 @@ define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovaps %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask) call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1) @@ -286,6 +289,7 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovapd %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask) call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1) @@ -300,6 +304,7 @@ define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64 ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1) @@ -314,6 +319,7 @@ define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i3 ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2) call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1) @@ -328,6 +334,7 @@ define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1) @@ -342,6 +349,7 @@ define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32 ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2) call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1) @@ -543,8 +551,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) @@ -560,6 +568,7 @@ define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) ret i16 %res @@ -572,6 +581,7 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) ret i16 %res @@ -585,6 +595,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) ret i8 %res @@ -597,6 +608,7 @@ define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andb %dil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret i8 %res @@ -610,6 +622,7 @@ define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) ret i16 %res @@ -622,6 +635,7 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) ret i16 %res @@ -635,6 +649,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) ret i8 %res @@ -647,6 +662,7 @@ define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andb %dil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret i8 %res @@ -726,9 +742,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -984,6 +1000,7 @@ define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) { ; CHECK-LABEL: test_storent_q_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovntps %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data) ret void @@ -995,6 +1012,7 @@ define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) { ; CHECK-LABEL: test_storent_pd_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovntps %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data) ret void @@ -1006,6 +1024,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) { ; CHECK-LABEL: test_storent_ps_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovntps %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data) ret void @@ -1955,8 +1974,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) @@ -1975,8 +1994,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -1995,8 +2014,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -2015,8 +2034,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) @@ -2035,8 +2054,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -2055,8 +2074,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) @@ -2075,8 +2094,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} ; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -2095,8 +2114,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -2116,8 +2135,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} ; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) @@ -2137,8 +2156,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} ; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) @@ -2156,8 +2175,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2176,8 +2195,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2197,8 +2216,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) @@ -2217,8 +2236,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) @@ -2237,8 +2256,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpslld $3, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) @@ -2257,8 +2276,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) @@ -3007,6 +3026,7 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) ret <4 x float> %res @@ -3031,6 +3051,7 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -3077,9 +3098,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3117,9 +3138,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) @@ -3164,6 +3185,7 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3208,6 +3230,7 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3254,6 +3277,7 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3298,6 +3322,7 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 @@ -3344,6 +3369,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: movl $255, %eax ; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -3388,6 +3414,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -3434,6 +3461,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: movl $255, %eax ; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -3478,6 +3506,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 @@ -3579,8 +3608,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) @@ -3612,9 +3641,9 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x ; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) @@ -3677,6 +3706,7 @@ define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) { ; CHECK-NEXT: andb %al, %dil ; CHECK-NEXT: addb %dil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) @@ -3693,6 +3723,7 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) { ; CHECK-NEXT: andl %eax, %edi ; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) @@ -3711,6 +3742,7 @@ define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 ; CHECK-NEXT: andl %eax, %edi ; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) @@ -3728,6 +3760,7 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2 ; CHECK-NEXT: andb %al, %dil ; CHECK-NEXT: addb %dil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) @@ -3822,6 +3855,7 @@ define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kortestw %k1, %k0 ; CHECK-NEXT: sete %al +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = bitcast <8 x i64> %A to <16 x i32> @@ -3845,6 +3879,7 @@ define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kortestw %k1, %k0 ; CHECK-NEXT: sete %al +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = bitcast <8 x i64> %A to <16 x i32> @@ -3865,6 +3900,7 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) ret i16 %res @@ -3877,6 +3913,7 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) ret i8 %res @@ -4181,8 +4218,8 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) @@ -4221,8 +4258,8 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 52e67ed6e98..4ee9b2941bb 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s define <16 x float> @test_rcp_ps_512(<16 x float> %a0) { ; CHECK-LABEL: test_rcp_ps_512: @@ -651,6 +651,7 @@ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8) %1 = bitcast <16 x i1> %res to i16 @@ -664,6 +665,7 @@ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4) %1 = bitcast <8 x i1> %res to i8 @@ -2164,8 +2166,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> % ; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovqb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -2183,6 +2186,7 @@ define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovqb %zmm0, (%rdi) ; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2198,8 +2202,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> ; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -2217,6 +2222,7 @@ define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovsqb %zmm0, (%rdi) ; CHECK-NEXT: vpmovsqb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2232,8 +2238,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> ; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -2251,6 +2258,7 @@ define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovusqb %zmm0, (%rdi) ; CHECK-NEXT: vpmovusqb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2266,8 +2274,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> % ; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovqw %zmm0, %xmm0 +; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2285,6 +2294,7 @@ define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovqw %zmm0, (%rdi) ; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2300,8 +2310,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> ; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 +; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2319,6 +2330,7 @@ define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovsqw %zmm0, (%rdi) ; CHECK-NEXT: vpmovsqw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2334,8 +2346,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> ; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 +; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2353,6 +2366,7 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovusqw %zmm0, (%rdi) ; CHECK-NEXT: vpmovusqw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2367,9 +2381,9 @@ define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovqd %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) @@ -2387,6 +2401,7 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovqd %zmm0, (%rdi) ; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2401,9 +2416,9 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) @@ -2421,6 +2436,7 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovsqd %zmm0, (%rdi) ; CHECK-NEXT: vpmovsqd %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2435,9 +2451,9 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) @@ -2455,6 +2471,7 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovusqd %zmm0, (%rdi) ; CHECK-NEXT: vpmovusqd %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) @@ -2470,8 +2487,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> ; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) @@ -2489,6 +2507,7 @@ define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovdb %zmm0, (%rdi) ; CHECK-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2504,8 +2523,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> ; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) @@ -2523,6 +2543,7 @@ define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovsdb %zmm0, (%rdi) ; CHECK-NEXT: vpmovsdb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2538,8 +2559,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8 ; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z} ; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) @@ -2557,6 +2579,7 @@ define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovusdb %zmm0, (%rdi) ; CHECK-NEXT: vpmovusdb %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2571,9 +2594,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -2591,6 +2614,7 @@ define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovdw %zmm0, (%rdi) ; CHECK-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2605,9 +2629,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i1 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovsdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovsdw %zmm0, %ymm0 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -2625,6 +2649,7 @@ define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovsdw %zmm0, (%rdi) ; CHECK-NEXT: vpmovsdw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2639,9 +2664,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vpmovusdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vpmovusdw %zmm0, %ymm0 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -2659,6 +2684,7 @@ define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmovusdw %zmm0, (%rdi) ; CHECK-NEXT: vpmovusdw %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) @@ -2936,9 +2962,9 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; CHECK-NEXT: kmovw %k0, %esi ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: orb %cl, %dl ; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: orb %dl, %al +; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq @@ -2981,9 +3007,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; CHECK-NEXT: kmovw %k0, %esi ; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andb %cl, %dl ; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) @@ -3388,12 +3414,12 @@ declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) @@ -3408,12 +3434,12 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) @@ -3431,8 +3457,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, < ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) @@ -3451,8 +3477,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) @@ -3496,8 +3522,8 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> @@ -3544,8 +3570,8 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) %2 = bitcast i16 %x3 to <16 x i1> |