diff options
| author | Gadi Haber <gadi.haber@intel.com> | 2017-06-27 15:05:13 +0000 |
|---|---|---|
| committer | Gadi Haber <gadi.haber@intel.com> | 2017-06-27 15:05:13 +0000 |
| commit | 13759a7ed62a362bc3d7455da8b96279e545cdc6 (patch) | |
| tree | 2090b37ec2372cd62bbda9fca89ec5c157b44be7 /llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | |
| parent | a179d25b99fec680d2430a07b6a35254c548e298 (diff) | |
| download | bcm5719-llvm-13759a7ed62a362bc3d7455da8b96279e545cdc6.tar.gz bcm5719-llvm-13759a7ed62a362bc3d7455da8b96279e545cdc6.zip | |
Updated and extended the information about each instruction in HSW (Haswell) and SNB (Sandy Bridge) to include the following data:
• static latency
• number of uOps of which the instruction consists
• all ports used by the instruction
Reviewers:
RKSimon
zvi
aymanmus
m_zuckerman
Differential Revision: https://reviews.llvm.org/D33897
llvm-svn: 306414
Diffstat (limited to 'llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 1480 |
1 files changed, 738 insertions, 742 deletions
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 2b89373ceb0..d56c4675b73 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1685,8 +1685,6 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .Lcfi9: ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al @@ -1707,39 +1705,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; 
AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, 
%ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al @@ -1748,8 +1746,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al @@ -1758,8 +1756,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; 
AVX512F-32-NEXT: movb %ch, %al @@ -1767,8 +1765,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -1777,8 +1775,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -1789,8 +1787,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} 
ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -1798,8 +1796,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1809,8 +1807,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1820,8 +1818,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1831,8 +1829,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; 
AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1842,8 +1840,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1852,8 +1850,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1864,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, 
%ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %bl @@ -1877,8 +1875,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl @@ -1887,8 +1885,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} 
ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1898,8 +1896,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1910,8 +1908,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; 
AVX512F-32-NEXT: movb %al, %dl @@ -1921,8 +1919,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def> @@ -1932,8 +1930,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1942,8 +1940,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa 
{{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1952,444 +1950,444 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; 
AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 
$1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; 
AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, 
%ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; 
AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, 
%ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 ; AVX512F-32-NEXT: shrl $12, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 ; AVX512F-32-NEXT: shrl $14, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 ; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, 
%ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrl $16, %ebx ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: andb $15, %al ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def> ; AVX512F-32-NEXT: shrb $7, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; 
AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: kmovd 
%eax, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $28, %eax +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpbroadcastd %xmm3, 
%xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, 
%ymm0, %ymm3 -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 @@ -2397,12 +2395,12 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1} -; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k3 {%k1} +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1} +; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k5 {%k1} +; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2571,8 +2569,6 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .Lcfi15: ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al @@ -2593,39 +2589,39 @@ define i64 
@test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al @@ -2634,8 +2630,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b 
%k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al @@ -2644,8 +2640,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al @@ -2653,8 +2649,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -2663,8 +2659,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -2675,8 +2671,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: 
vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -2684,8 +2680,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2695,8 +2691,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2706,8 +2702,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2717,8 +2713,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2728,8 +2724,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2738,8 +2734,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2750,8 +2746,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %bl @@ -2763,8 +2759,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl @@ -2773,8 +2769,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; 
AVX512F-32-NEXT: movb %al, %dl @@ -2784,8 +2780,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2796,8 +2792,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2807,8 +2803,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa 
{{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def> @@ -2818,8 +2814,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2828,8 +2824,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2838,444 +2834,444 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 
{{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: 
vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: 
vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: 
vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, 
%ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; 
AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: 
movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; 
AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 ; AVX512F-32-NEXT: shrl $12, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, 
%zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 ; AVX512F-32-NEXT: shrl $14, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 
= [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 ; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrl $16, %ebx ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %bl, %al ; 
AVX512F-32-NEXT: andb $15, %al ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; 
AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def> ; AVX512F-32-NEXT: shrb $7, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; 
AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: 
andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: movl %ecx, %eax +; 
AVX512F-32-NEXT: shrl $28, %eax +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: 
vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; 
AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 @@ -3283,12 +3279,12 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1} -; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} -; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1} -; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k3 {%k1} +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1} +; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k5 {%k1} +; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx |

