| author | Igor Breger <igor.breger@intel.com> | 2016-02-07 08:30:50 +0000 |
|---|---|---|
| committer | Igor Breger <igor.breger@intel.com> | 2016-02-07 08:30:50 +0000 |
| commit | 0aeda3746443657cb1c0df4da18137e4a49659f5 (patch) | |
| tree | e653e9fdefdf589e00aef590dc44264809f2757c /llvm/test | |
| parent | 46bafbd0fe5236fb3d17851f296e7b3437929c35 (diff) | |
| download | bcm5719-llvm-0aeda3746443657cb1c0df4da18137e4a49659f5.tar.gz bcm5719-llvm-0aeda3746443657cb1c0df4da18137e4a49659f5.zip | |
AVX512: VPBROADCASTB/W/D/Q from GPR intrinsics implementation.
Differential Revision: http://reviews.llvm.org/D16813
llvm-svn: 260024
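For reference, a minimal sketch of the new masked broadcast-from-GPR intrinsic that this patch wires up, distilled from the tests added in the diff below (the function name here is illustrative; the declare and expected codegen come from avx512-intrinsics.ll):

```llvm
; Broadcast a 32-bit GPR value into a zmm register under a write-mask.
; Per the CHECK lines added in this patch, this is expected to select:
;   kmovw %esi, %k1
;   vpbroadcastd %edi, %zmm0 {%k1}
declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)

define <16 x i32> @broadcast_i32_masked(i32 %x, <16 x i32> %passthru, i16 %mask) {
  %r = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x, <16 x i32> %passthru, i16 %mask)
  ret <16 x i32> %r
}
```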
Diffstat (limited to 'llvm/test')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-isa-check.ll | 75 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 59 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 64 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll | 79 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 79 |
| -rw-r--r-- | llvm/test/CodeGen/X86/masked_gather_scatter.ll | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 54 |
7 files changed, 363 insertions, 50 deletions
diff --git a/llvm/test/CodeGen/X86/avx-isa-check.ll b/llvm/test/CodeGen/X86/avx-isa-check.ll index 7664b7d2093..7a7959d29bb 100644 --- a/llvm/test/CodeGen/X86/avx-isa-check.ll +++ b/llvm/test/CodeGen/X86/avx-isa-check.ll @@ -1,5 +1,6 @@ ; check AVX2 instructions that are disabled in case avx512VL/avx512BW present - + +; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=corei7-avx -o /dev/null ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=core-avx2 -mattr=+avx2 -o /dev/null ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -o /dev/null ; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -o /dev/null @@ -576,6 +577,78 @@ entry: ret <8 x i16> %C } +define <32 x i8> @_broadcast32xi8(i8 %a) { + %b = insertelement <32 x i8> undef, i8 %a, i32 0 + %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + ret <32 x i8> %c +} + +define <16 x i8> @_broadcast16xi8(i8 %a) { + %b = insertelement <16 x i8> undef, i8 %a, i32 0 + %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %c +} + +define <16 x i16> @_broadcast16xi16(i16 %a) { + %b = insertelement <16 x i16> undef, i16 %a, i32 0 + %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + ret <16 x i16> %c +} + +define <8 x i16> @_broadcast8xi16(i16 %a) { + %b = insertelement <8 x i16> undef, i16 %a, i32 0 + %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %c +} + +define <8 x i32> @_broadcast8xi32(i32 %a) { + %b = insertelement <8 x i32> undef, i32 %a, i32 0 + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <4 x i32> @_broadcast4xi32(i32 %a) { + %b = insertelement <4 x i32> undef, i32 %a, i32 0 + %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <4 x i64> @_broadcast4xi64(i64 %a) { + %b = insertelement <4 x i64> undef, i64 %a, i64 0 + %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer + ret <4 x i64> %c +} + +define <2 x i64> @_broadcast2xi64(i64 %a) { + %b = insertelement <2 x i64> undef, i64 %a, i64 0 + %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %c +} + +define <8 x float> @_broadcast8xfloat(float %a) { + %b = insertelement <8 x float> undef, float %a, i32 0 + %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %c +} + +define <4 x float> @_broadcast4xfloat(float %a) { + %b = insertelement <4 x float> undef, float %a, i32 0 + %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %c +} + +define <4 x double> @_broadcast4xdouble(double %a) { + %b = insertelement <4 x double> undef, double %a, i32 0 + %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %c +} + +define <2 x double> @_broadcast2xdouble(double %a) { + %b = insertelement <2 x double> undef, double %a, i32 0 + %c = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer + ret <2 x double> %c +} + define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fsub <4 x float> %x, %a2 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index c21379f0840..756a3b5e510 100644 --- 
a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -574,16 +574,6 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> } declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) -define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) { -; CHECK-LABEL: test_x86_pbroadcastd_i32_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd %edi, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly - define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512: ; CHECK: ## BB#0: @@ -603,16 +593,6 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x } declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) -define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) { -; CHECK-LABEL: test_x86_pbroadcastq_i64_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly - define <16 x i32> @test_conflict_d(<16 x i32> %a) { ; CHECK-LABEL: test_conflict_d: ; CHECK: ## BB#0: @@ -7357,6 +7337,45 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2 ret i8 %res2 } +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %edi, %zmm2 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res2, %res3 + ret <16 x i32> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} +; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %rdi, %zmm2 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} +declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) + declare <2 x double> 
@llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 26a80e3f4a6..5592f7f61ac 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -3516,3 +3516,67 @@ define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %res2 = add i32 %res, %res1 ret i32 %res2 } + +declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: vpbroadcastb %dil, %zmm0 {%k1} +; AVX512BW-NEXT: vpbroadcastb %dil, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpbroadcastb %dil, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpbroadcastb %al, %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpbroadcastb %al, %zmm0 {%k1} +; AVX512F-32-NEXT: vpbroadcastb %al, %zmm2 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) + %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) + %res3 = add <64 x i8> %res, %res1 + %res4 = add <64 x i8> %res2, %res3 + ret <64 x i8> %res4 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpbroadcastw %di, %zmm0 {%k1} +; AVX512BW-NEXT: vpbroadcastw %di, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpbroadcastw %di, %zmm2 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm0 {%k1} +; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm2 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) + %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) + %res3 = add <32 x i16> %res, %res1 + %res4 = add <32 x i16> %res2, %res3 + 
ret <32 x i16> %res4 +} diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 16fc0d51ac3..ad16ba1845d 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -5450,3 +5450,82 @@ define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 ret i16 %res2 } +declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vpbroadcastb %dil, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastb %dil, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastb %dil, %ymm2 +; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) + %res3 = add <32 x i8> %res, %res1 + %res4 = add <32 x i8> %res2, %res3 + ret <32 x i8> %res4 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastb %dil, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastb %dil, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastb %dil, %xmm2 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) + %res3 = add <16 x i8> %res, %res1 + %res4 = add <16 x i8> %res2, %res3 + ret <16 x i8> %res4 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastw %di, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastw %di, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastw %di, %ymm2 +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) + %res3 = add <16 x i16> %res, %res1 + %res4 = add <16 x i16> %res2, %res3 + ret <16 x i16> %res4 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw 
%esi, %k1 +; CHECK-NEXT: vpbroadcastw %di, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastw %di, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastw %di, %xmm2 +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) + %res3 = add <8 x i16> %res, %res1 + %res4 = add <8 x i16> %res2, %res3 + ret <8 x i16> %res4 +} diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 157adeee823..293ebdf915d 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -8236,3 +8236,82 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2 ret i8 %res2 } +declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %edi, %ymm2 +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %edi, %xmm2 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res2, %res3 + ret <4 x i32> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} +; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 
-1) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res2, %res3 + ret <4 x i64> %res4 +} + +declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} +; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) + %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) + %res3 = add <2 x i64> %res, %res1 + %res4 = add <2 x i64> %res2, %res3 + ret <2 x i64> %res4 +} diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 9f8e819cad5..9588467e96d 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -637,8 +637,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 -; SKX-NEXT: vmovd %esi, %xmm1 -; SKX-NEXT: vpbroadcastd %xmm1, %ymm1 +; SKX-NEXT: vpbroadcastd %esi, %ymm1 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index f9ad5a4cc45..8edc8193189 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -74,13 +74,13 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] -; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: movq {{.*}}(%rip), %rax +; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -105,14 +105,14 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = 
[3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: movl {{.*}}(%rip), %eax +; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -163,13 +163,13 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: movq {{.*}}(%rip), %rax +; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -310,12 +310,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: movq {{.*}}(%rip), %rax +; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq |
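The non-intrinsic splat idiom benefits from the same lowering: the masked_gather_scatter.ll and vector-shuffle-v1.ll hunks above replace a `vmovd`/`vpbroadcastd %xmm` pair with a single broadcast directly from the general-purpose register. A minimal sketch of that input pattern, mirroring the functions added to avx-isa-check.ll (the function name is illustrative):

```llvm
; Splat one i32 value across an 8-lane vector. With AVX-512VL this patch
; allows the splat to select vpbroadcastd from the 32-bit GPR instead of
; vmovd followed by a vector-register broadcast (see the
; masked_gather_scatter.ll hunk above).
define <8 x i32> @splat_gpr_i32(i32 %a) {
  %v = insertelement <8 x i32> undef, i32 %a, i32 0
  %s = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> zeroinitializer
  ret <8 x i32> %s
}
```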

