Diffstat (limited to 'llvm/test/CodeGen/X86/vector-fshl-256.ll')
-rw-r--r-- | llvm/test/CodeGen/X86/vector-fshl-256.ll | 486
1 file changed, 456 insertions, 30 deletions
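All of the new CHECK prefixes below exercise the same IR pattern; as a minimal sketch (reconstructed from the hunk headers, so the exact body in the file may differ), the v4i64 variable-amount case is a direct call to the llvm.fshl funnel-shift intrinsic:

declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)

define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
  ; fshl concatenates %x (high half) with %y (low half), shifts left by %amt
  ; modulo 64 per element, and returns the high half; on AVX512VL+VBMI2
  ; targets this can lower to a single vpshldvq, as the new checks show.
  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
  ret <4 x i64> %res
}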
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index aac4662cff0..8aeae700d6a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -4,7 +4,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 @@ -113,6 +115,22 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] +; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: var_funnnel_v4i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] @@ -127,6 +145,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: var_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -278,6 +301,22 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; 
AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: var_funnnel_v8i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] @@ -292,6 +331,11 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: var_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -485,6 +529,23 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -499,6 +560,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: var_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -745,6 +811,27 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: var_funnnel_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] @@ -764,6 +851,25 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 +; AVX512VLVBMI2-NEXT: vmovdqa 
{{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -906,6 +1012,22 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm5 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 @@ -921,6 +1043,12 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] @@ -1064,6 +1192,24 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; 
AVX512VBMI2-NEXT: vpslld %xmm5, %ymm0, %ymm5 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 @@ -1081,6 +1227,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] @@ -1228,6 +1380,24 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm5, %ymm0, %ymm5 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 @@ -1245,6 +1415,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] @@ -1430,6 +1606,27 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 @@ -1450,6 +1647,26 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 @@ -1537,12 +1754,45 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_funnnel_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constant_funnnel_v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_funnnel_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_funnnel_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLBW-LABEL: constant_funnnel_v4i64: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: 
vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvq {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: constant_funnnel_v4i64: ; XOPAVX1: # %bb.0: @@ -1600,12 +1850,45 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_funnnel_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constant_funnnel_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_funnnel_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_funnnel_v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLBW-LABEL: constant_funnnel_v8i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvd {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: constant_funnnel_v8i32: ; XOPAVX1: # %bb.0: @@ -1693,6 +1976,19 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VBMI2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: constant_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 @@ -1702,6 +1998,11 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %ymm1, %ymm0 +; 
AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2 @@ -1866,6 +2167,18 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512VBMI2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; ; AVX512VLBW-LABEL: constant_funnnel_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero @@ -1880,6 +2193,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VLVBMI2-NEXT: movl $16843009, %eax # imm = 0x1010101 +; AVX512VLVBMI2-NEXT: kmovd %eax, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -1942,12 +2269,45 @@ define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwi ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_funnnel_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlq $50, %ymm1, %ymm1 -; AVX512-NEXT: vpsllq $14, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_funnnel_v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlq $50, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllq $14, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_funnnel_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlq $50, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllq $14, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_funnnel_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlq $50, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllq $14, %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlq $50, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllq $14, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlq $50, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpsllq $14, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldq $14, %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64: ; XOPAVX1: # %bb.0: @@ -1993,12 +2353,45 @@ define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwi ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_funnnel_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $28, %ymm1, %ymm1 -; AVX512-NEXT: vpslld $4, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_funnnel_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrld $28, %ymm1, %ymm1 +; AVX512F-NEXT: vpslld $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_funnnel_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrld $28, %ymm1, %ymm1 +; AVX512VL-NEXT: vpslld $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_funnnel_v8i32: +; AVX512BW: # 
%bb.0: +; AVX512BW-NEXT: vpsrld $28, %ymm1, %ymm1 +; AVX512BW-NEXT: vpslld $4, %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrld $28, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpslld $4, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrld $28, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpslld $4, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldd $4, %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32: ; XOPAVX1: # %bb.0: @@ -2044,12 +2437,45 @@ define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) no ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_funnnel_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $9, %ymm1, %ymm1 -; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_funnnel_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_funnnel_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_funnnel_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16: ; XOPAVX1: # %bb.0: |