diff options
Diffstat (limited to 'llvm/test/CodeGen/X86')
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-ext.ll (renamed from llvm/test/CodeGen/X86/avx512-trunc-ext.ll) | 26 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 390 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-trunc.ll | 364 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 78 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll | 156 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 780 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/masked_memop.ll | 11 |
7 files changed, 1776 insertions, 29 deletions
diff --git a/llvm/test/CodeGen/X86/avx512-trunc-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index f25458972e4..aa1dd4928c3 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1,24 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX - -; KNL-LABEL: trunc_16x32_to_16x8 -; KNL: vpmovdb -; KNL: ret -define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone { - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -; KNL-LABEL: trunc_8x64_to_8x16 -; KNL: vpmovqw -; KNL: ret -define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone { - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -;SKX-LABEL: zext_8x8mem_to_8x16: + ;SKX-LABEL: zext_8x8mem_to_8x16: ;SKX: ## BB#0: ;SKX-NEXT: vpmovw2m %xmm0, %k1 ;SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z} @@ -895,13 +878,6 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ret <8 x i32> %y } -; KNL-LABEL: trunc_v16i32_to_v16i16 -; KNL: vpmovdw -; KNL: ret -define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { - %1 = trunc <16 x i32> %x to <16 x i16> - ret <16 x i16> %1 -} ; KNL-LABEL: trunc_i32_to_i1 ; KNL: movw $-4, %ax diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 6e50fda7467..7c30063ce28 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -3119,6 +3119,396 @@ define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 ret <16 x float> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512: +; CHECK: vpmovqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512: +; CHECK: vpmovqb %zmm0, (%rdi) +; CHECK: vpmovqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512: +; CHECK: vpmovsqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512: +; CHECK: vpmovsqb %zmm0, (%rdi) +; CHECK: vpmovsqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512: +; CHECK: vpmovusqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512: +; CHECK: vpmovusqb %zmm0, (%rdi) +; CHECK: vpmovusqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512: +; CHECK: vpmovqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512: +; CHECK: vpmovqw %zmm0, (%rdi) +; CHECK: vpmovqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512: +; CHECK: vpmovsqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512: +; CHECK: vpmovsqw %zmm0, (%rdi) +; CHECK: vpmovsqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512: +; CHECK: vpmovusqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512: +; CHECK: vpmovusqw %zmm0, (%rdi) +; CHECK: vpmovusqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512: +; CHECK: vpmovqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512: +; CHECK: vpmovqd %zmm0, (%rdi) +; CHECK: vpmovqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: +; CHECK: vpmovsqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512: +; CHECK: vpmovsqd %zmm0, (%rdi) +; CHECK: vpmovsqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: +; CHECK: vpmovusqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512: +; CHECK: vpmovusqd %zmm0, (%rdi) +; CHECK: vpmovusqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512: +; CHECK: vpmovdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512: +; CHECK: vpmovdb %zmm0, (%rdi) +; CHECK: vpmovdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512: +; CHECK: vpmovsdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512: +; CHECK: vpmovsdb %zmm0, (%rdi) +; CHECK: vpmovsdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512: +; CHECK: vpmovusdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512: +; CHECK: vpmovusdb %zmm0, (%rdi) +; CHECK: vpmovusdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512: +; CHECK: vpmovdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512: +; CHECK: vpmovdw %zmm0, (%rdi) +; CHECK: vpmovdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512: +; CHECK: vpmovsdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512: +; CHECK: vpmovsdw %zmm0, (%rdi) +; CHECK: vpmovsdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512: +; CHECK: vpmovusdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512: +; CHECK: vpmovusdw %zmm0, (%rdi) +; CHECK: vpmovusdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll new file mode 100644 index 00000000000..9205feda7eb --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -0,0 +1,364 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX + + attributes #0 = { nounwind } + +; KNL-LABEL: trunc_16x32_to_16x8 +; KNL: vpmovdb +; KNL: ret +define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 { + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +; KNL-LABEL: trunc_8x64_to_8x16 +; KNL: vpmovqw +; KNL: ret +define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 { + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +; KNL-LABEL: trunc_v16i32_to_v16i16 +; KNL: vpmovdw +; KNL: ret +define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 { + %1 = trunc <16 x i32> %x to <16 x i16> + ret <16 x i16> %1 +} + +define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + ret <2 x i8> %x +} + +define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + store <2 x i8> %x, <2 x i8>* %res + ret void +} + +define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + ret <2 x i16> %x +} + +define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + store <2 x i16> %x, <2 x i16>* %res + ret void +} + +define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + ret <8 x i32> %x +} + +define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + store <8 x i32> %x, <8 x i32>* %res + ret void +} + +define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + ret <4 x i32> %x +} + +define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + store <4 x i32> %x, <4 x i32>* %res + ret void +} + +define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + ret <2 x i32> %x +} + +define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + store <2 x i32> %x, <2 x i32>* %res + ret void +} + +define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_db_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_db_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_db_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + ret <16 x i16> %x +} + +define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + store <16 x i16> %x, <16 x i16>* %res + ret void +} + +define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_dw_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + ret <32 x i8> %x +} + +define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + store <32 x i8> %x, <32 x i8>* %res + ret void +} + +define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index b2b417df2f1..5ad28ab5ab5 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1008,6 +1008,84 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i ret <32 x i16> %res2 } +declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; CHECK: vpmovwb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; CHECK: vpmovwb %zmm0, (%rdi) +; CHECK: vpmovwb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; CHECK: vpmovswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; CHECK: vpmovswb %zmm0, (%rdi) +; CHECK: vpmovswb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; CHECK: vpmovuswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; CHECK: vpmovuswb %zmm0, (%rdi) +; CHECK: vpmovuswb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 2373dc089ae..ee76ae2a8a3 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3876,6 +3876,162 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i ret <16 x i16> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128: +; CHECK: vpmovwb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128: +; CHECK: vpmovwb %xmm0, (%rdi) +; CHECK: vpmovwb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128: +; CHECK: vpmovswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128: +; CHECK: vpmovswb %xmm0, (%rdi) +; CHECK: vpmovswb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128: +; CHECK: vpmovuswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128: +; CHECK: vpmovuswb %xmm0, (%rdi) +; CHECK: vpmovuswb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256: +; CHECK: vpmovwb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256: +; CHECK: vpmovwb %ymm0, (%rdi) +; CHECK: vpmovwb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: +; CHECK: vpmovswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256: +; CHECK: vpmovswb %ymm0, (%rdi) +; CHECK: vpmovswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: +; CHECK: vpmovuswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256: +; CHECK: vpmovuswb %ymm0, (%rdi) +; CHECK: vpmovuswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 46ee51f47b6..7812148de1c 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3005,6 +3005,786 @@ define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x ret <8 x float> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128: +; CHECK: vpmovqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128: +; CHECK: vpmovqb %xmm0, (%rdi) +; CHECK: vpmovqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128: +; CHECK: vpmovsqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128: +; CHECK: vpmovsqb %xmm0, (%rdi) +; CHECK: vpmovsqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128: +; CHECK: vpmovusqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128: +; CHECK: vpmovusqb %xmm0, (%rdi) +; CHECK: vpmovusqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256: +; CHECK: vpmovqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256: +; CHECK: vpmovqb %ymm0, (%rdi) +; CHECK: vpmovqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256: +; CHECK: vpmovsqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256: +; CHECK: vpmovsqb %ymm0, (%rdi) +; CHECK: vpmovsqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256: +; CHECK: vpmovusqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256: +; CHECK: vpmovusqb %ymm0, (%rdi) +; CHECK: vpmovusqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128: +; CHECK: vpmovqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128: +; CHECK: vpmovqw %xmm0, (%rdi) +; CHECK: vpmovqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128: +; CHECK: vpmovsqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128: +; CHECK: vpmovsqw %xmm0, (%rdi) +; CHECK: vpmovsqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128: +; CHECK: vpmovusqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128: +; CHECK: vpmovusqw %xmm0, (%rdi) +; CHECK: vpmovusqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256: +; CHECK: vpmovqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256: +; CHECK: vpmovqw %ymm0, (%rdi) +; CHECK: vpmovqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256: +; CHECK: vpmovsqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256: +; CHECK: vpmovsqw %ymm0, (%rdi) +; CHECK: vpmovsqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256: +; CHECK: vpmovusqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256: +; CHECK: vpmovusqw %ymm0, (%rdi) +; CHECK: vpmovusqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128: +; CHECK: vpmovqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128: +; CHECK: vpmovqd %xmm0, (%rdi) +; CHECK: vpmovqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128: +; CHECK: vpmovsqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128: +; CHECK: vpmovsqd %xmm0, (%rdi) +; CHECK: vpmovsqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128: +; CHECK: vpmovusqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128: +; CHECK: vpmovusqd %xmm0, (%rdi) +; CHECK: vpmovusqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256: +; CHECK: vpmovqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256: +; CHECK: vpmovqd %ymm0, (%rdi) +; CHECK: vpmovqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: +; CHECK: vpmovsqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256: +; CHECK: vpmovsqd %ymm0, (%rdi) +; CHECK: vpmovsqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: +; CHECK: vpmovusqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256: +; CHECK: vpmovusqd %ymm0, (%rdi) +; CHECK: vpmovusqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128: +; CHECK: vpmovdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128: +; CHECK: vpmovdb %xmm0, (%rdi) +; CHECK: vpmovdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128: +; CHECK: vpmovsdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128: +; CHECK: vpmovsdb %xmm0, (%rdi) +; CHECK: vpmovsdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128: +; CHECK: vpmovusdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128: +; CHECK: vpmovusdb %xmm0, (%rdi) +; CHECK: vpmovusdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256: +; CHECK: vpmovdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256: +; CHECK: vpmovdb %ymm0, (%rdi) +; CHECK: vpmovdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256: +; CHECK: vpmovsdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256: +; CHECK: vpmovsdb %ymm0, (%rdi) +; CHECK: vpmovsdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256: +; CHECK: vpmovusdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256: +; CHECK: vpmovusdb %ymm0, (%rdi) +; CHECK: vpmovusdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128: +; CHECK: vpmovdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128: +; CHECK: vpmovdw %xmm0, (%rdi) +; CHECK: vpmovdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128: +; CHECK: vpmovsdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128: +; CHECK: vpmovsdw %xmm0, (%rdi) +; CHECK: vpmovsdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128: +; CHECK: vpmovusdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128: +; CHECK: vpmovusdw %xmm0, (%rdi) +; CHECK: vpmovusdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256: +; CHECK: vpmovdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256: +; CHECK: vpmovdw %ymm0, (%rdi) +; CHECK: vpmovdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256: +; CHECK: vpmovsdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256: +; CHECK: vpmovsdw %ymm0, (%rdi) +; CHECK: vpmovsdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256: +; CHECK: vpmovusdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256: +; CHECK: vpmovusdw %ymm0, (%rdi) +; CHECK: vpmovusdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 6c16e634a59..f51d4fa103e 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -190,10 +190,13 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; AVX2-LABEL: test15 ; AVX2: vpmaskmovd -; SKX-LABEL: test15 -; SKX: kshiftl -; SKX: kshiftr -; SKX: vmovdqu32 {{.*}}{%k1} +; SKX-LABEL: test15: +; SKX: ## BB#0: +; SKX-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) |

