summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
diff options
context:
space:
mode:
authorGadi Haber <gadi.haber@intel.com>2017-06-27 15:05:13 +0000
committerGadi Haber <gadi.haber@intel.com>2017-06-27 15:05:13 +0000
commit13759a7ed62a362bc3d7455da8b96279e545cdc6 (patch)
tree2090b37ec2372cd62bbda9fca89ec5c157b44be7 /llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
parenta179d25b99fec680d2430a07b6a35254c548e298 (diff)
downloadbcm5719-llvm-13759a7ed62a362bc3d7455da8b96279e545cdc6.tar.gz
bcm5719-llvm-13759a7ed62a362bc3d7455da8b96279e545cdc6.zip
Updated and extended the information about each instruction in HSW and SNB to include the following data:
• static latency • number of uOps of which the instruction consists • all ports used by the instruction. Reviewers:  RKSimon zvi aymanmus m_zuckerman Differential Revision: https://reviews.llvm.org/D33897 llvm-svn: 306414
Diffstat (limited to 'llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll')
-rw-r--r--llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll1480
1 files changed, 738 insertions, 742 deletions
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 2b89373ceb0..d56c4675b73 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1685,8 +1685,6 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi9:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -1707,39 +1705,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -1748,8 +1746,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -1758,8 +1756,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -1767,8 +1765,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -1777,8 +1775,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -1789,8 +1787,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -1798,8 +1796,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1809,8 +1807,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1820,8 +1818,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1831,8 +1829,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1842,8 +1840,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1852,8 +1850,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1864,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -1877,8 +1875,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -1887,8 +1885,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1898,8 +1896,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1910,8 +1908,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1921,8 +1919,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -1932,8 +1930,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1942,8 +1940,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1952,444 +1950,444 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -2397,12 +2395,12 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2571,8 +2569,6 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi15:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -2593,39 +2589,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -2634,8 +2630,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -2644,8 +2640,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -2653,8 +2649,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -2663,8 +2659,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -2675,8 +2671,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -2684,8 +2680,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2695,8 +2691,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2706,8 +2702,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2717,8 +2713,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2728,8 +2724,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2738,8 +2734,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2750,8 +2746,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -2763,8 +2759,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -2773,8 +2769,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2784,8 +2780,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2796,8 +2792,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2807,8 +2803,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -2818,8 +2814,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2828,8 +2824,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2838,444 +2834,444 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -3283,12 +3279,12 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
OpenPOWER on IntegriCloud