Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vec-cmp.ll              |   4
-rw-r--r--  llvm/test/CodeGen/X86/kshift.ll                      | 116
-rw-r--r--  llvm/test/CodeGen/X86/movmsk-cmp.ll                  |  36
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll   |  72
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll  |  28
-rw-r--r--  llvm/test/CodeGen/X86/setcc-lowering.ll              |   3
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshl-128.ll             | 494
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshl-256.ll             | 488
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshl-512.ll             | 632
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-128.ll             | 494
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-256.ll             | 488
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-512.ll             | 560
-rw-r--r--  llvm/test/CodeGen/X86/vector-lzcnt-512.ll            |  76
13 files changed, 1645 insertions(+), 1846 deletions(-)
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 5b9b4005c95..2c7d63d5ab9 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -853,8 +853,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; KNL-NEXT: vpmovzxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd2]
; KNL-NEXT: ## zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf2,0x3f]
-; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x27,0xca]
-; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xc2,0x0f,0x01]
+; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01]
+; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x27,0xca]
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0]
; KNL-NEXT: retq ## encoding: [0xc3]
;
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index aaa5ff7567d..a12fd46b5af 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -10,8 +10,8 @@ define i8 @kshiftl_v8i1_1(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: movb $-2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -75,15 +75,15 @@ define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: kshiftlw $1, %k2, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: kshiftlw $1, %k2, %k1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2}
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
@@ -112,38 +112,38 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT: kshiftlw $1, %k3, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: kshiftlw $1, %k1, %k3
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm6
+; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
@@ -182,8 +182,8 @@ define i8 @kshiftl_v8i1_7(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: movb $-128, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -308,8 +308,8 @@ define i8 @kshiftr_v8i1_1(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
-; KNL-NEXT: vptestmq %zmm3, %zmm3, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm3, %zmm3, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -373,10 +373,10 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
@@ -411,44 +411,44 @@ define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm5
; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT: kshiftrw $1, %k3, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
+; KNL-NEXT: kshiftrw $1, %k1, %k3
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6
+; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: shlq $32, %rcx
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
@@ -480,8 +480,8 @@ define i8 @kshiftr_v8i1_7(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: movb $-2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -605,8 +605,8 @@ define i8 @kshiftl_v8i1_zu123u56(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <8,u,1,2,3,u,5,6>
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -636,8 +636,8 @@ define i8 @kshiftl_v8i1_u0123456(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -669,8 +669,8 @@ define i8 @kshiftr_v8i1_1u3u567z(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,u,3,u,5,6,7,15>
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -700,8 +700,8 @@ define i8 @kshiftr_v8i1_234567uu(<8 x i64> %x, <8 x i64> %y) {
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,0,1]
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 01d5252c231..06862809786 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -2087,8 +2087,7 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) {
; KNL-LABEL: allones_v4i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -2131,8 +2130,7 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) {
; KNL-LABEL: allzeros_v4i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -2192,8 +2190,7 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) {
; KNL-LABEL: allones_v8i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: cmpb $-1, %al
; KNL-NEXT: sete %al
@@ -2253,8 +2250,7 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) {
; KNL-LABEL: allzeros_v8i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: sete %al
@@ -2573,8 +2569,7 @@ define i1 @allones_v4i64_and1(<4 x i64> %arg) {
; KNL-LABEL: allones_v4i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -2640,8 +2635,7 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) {
; KNL-LABEL: allzeros_v4i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -3686,8 +3680,7 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) {
; KNL-LABEL: allones_v4i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -3730,8 +3723,7 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) {
; KNL-LABEL: allzeros_v4i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -3791,8 +3783,7 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) {
; KNL-LABEL: allones_v8i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: cmpb $-1, %al
; KNL-NEXT: sete %al
@@ -3852,8 +3843,7 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) {
; KNL-LABEL: allzeros_v8i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: sete %al
@@ -4172,8 +4162,7 @@ define i1 @allones_v4i64_and4(<4 x i64> %arg) {
; KNL-LABEL: allones_v4i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -4239,8 +4228,7 @@ define i1 @allzeros_v4i64_and4(<4 x i64> %arg) {
; KNL-LABEL: allzeros_v4i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index b4d452f2d3e..2ee68e39b91 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -7,8 +7,8 @@
define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_sext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
;
; AVX512VL-LABEL: testv8i1_sext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -56,9 +57,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -88,9 +90,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -101,9 +104,10 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_sext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
@@ -131,8 +135,8 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_zext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -142,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
;
; AVX512VL-LABEL: testv8i1_zext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -168,9 +172,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -184,9 +189,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_zext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -216,9 +222,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -230,9 +237,10 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_zext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 1983b7a638d..75564986809 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -11,9 +11,10 @@
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VL: # %bb.0:
-; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
@@ -42,9 +43,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -57,9 +59,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VLBW: # %bb.0:
-; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX256VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1
; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0
; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1
; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -71,9 +74,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index 9bcb4fd1e63..1cf20bf62e4 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -24,8 +24,7 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
; KNL-32-LABEL: pr25080:
; KNL-32: # %bb.0: # %entry
; KNL-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
+; KNL-32-NEXT: vptestnmd {{\.LCPI.*}}{1to16}, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 4a51e3341b2..e7e187629e5 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -108,16 +108,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -125,30 +123,27 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -156,16 +151,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -173,14 +166,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -358,16 +350,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -375,30 +366,28 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -406,16 +395,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -423,14 +411,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -719,17 +706,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -737,17 +722,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -755,14 +738,13 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1040,21 +1022,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1062,21 +1042,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1084,19 +1062,18 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: vzeroupper
@@ -1104,19 +1081,18 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
@@ -1276,14 +1252,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -1292,14 +1267,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
@@ -1308,14 +1282,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1325,14 +1298,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1341,14 +1313,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
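
Every hunk above applies the same change: the [63,63] mask constant is no longer kept live in its own register. The shift amount is masked in place (the vpand now writes back into %xmm2), so the trailing vptestnm can test the masked amount against itself instead of re-testing the raw amount against the constant, and the %xmm5/%xmm6 temporaries disappear. The following is an illustrative IR-level sketch of the expansion these CHECK lines encode (the function name is made up; the per-line comments map to the instructions checked above):

define <2 x i64> @fshl_v2i64_expansion(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) {
  %a   = and <2 x i64> %amt, <i64 63, i64 63>              ; vpand
  %hi  = shl <2 x i64> %x, %a                              ; vpsllq
  %inv = sub <2 x i64> <i64 64, i64 64>, %a                ; vpsubq
  %lo  = lshr <2 x i64> %y, %inv                           ; vpsrlq
  %or  = or <2 x i64> %hi, %lo                             ; vpor
  %z   = icmp eq <2 x i64> %a, zeroinitializer             ; vptestnmq
  %res = select <2 x i1> %z, <2 x i64> %x, <2 x i64> %or   ; masked vmovdqa64
  ret <2 x i64> %res
}

Note that in a lane where the masked amount is zero, %inv is 64 and the lshr lane is out of range, which is why the select (the vptestnm plus masked-move pair) must pass %x through rather than relying on the or.
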
@@ -1497,15 +1468,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -1514,16 +1485,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
@@ -1533,15 +1503,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1552,15 +1522,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1569,16 +1539,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1759,16 +1728,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1778,16 +1746,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1796,16 +1763,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -2031,18 +1997,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -2053,18 +2018,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2073,19 +2037,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: vzeroupper
@@ -2094,19 +2057,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
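
The v16i8 blocks above add one more step: x86 has no variable per-byte shifts, so data and amounts are zero-extended to i16 lanes (vpmovzxbw), shifted with vpsllvw/vpsrlvw, OR'd, and narrowed back with vpmovwb before the same zero-amount select. A rough IR equivalent, as a sketch only (not the literal test body, and the name is hypothetical):

define <16 x i8> @fshl_v16i8_widened(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) {
  %a   = and <16 x i8> %amt, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %aw  = zext <16 x i8> %a to <16 x i16>                   ; vpmovzxbw
  %xw  = zext <16 x i8> %x to <16 x i16>
  %yw  = zext <16 x i8> %y to <16 x i16>
  %hi  = shl <16 x i16> %xw, %aw                           ; vpsllvw
  %inv = sub <16 x i16> <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>, %aw
  %lo  = lshr <16 x i16> %yw, %inv                         ; vpsrlvw
  %or  = or <16 x i16> %hi, %lo
  %nar = trunc <16 x i16> %or to <16 x i8>                 ; vpmovwb
  %z   = icmp eq <16 x i8> %a, zeroinitializer             ; vptestnmb
  %res = select <16 x i1> %z, <16 x i8> %x, <16 x i8> %nar
  ret <16 x i8> %res
}
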
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 0133d9acca7..bf89b154930 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -71,76 +71,71 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -257,76 +252,71 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -514,48 +504,43 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -792,80 +777,74 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: retq
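
The splatvar hunks that follow exercise the same expansion when every lane shares one shift amount: the amount is broadcast (the vpbroadcastq in the output) so the backend can use the scalar-count shift forms (vpsllq/vpsrlq with an %xmm count) instead of per-lane variable shifts. Assuming these tests follow the usual shape in this file, the IR looks roughly like this (sketch; the function name is invented):

declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)

define <4 x i64> @splatvar_fshl_sketch(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) {
  ; broadcast lane 0 of the amount to every lane
  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
  ret <4 x i64> %res
}
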
@@ -970,13 +949,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
@@ -984,14 +963,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
@@ -1001,13 +979,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1017,13 +995,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1031,14 +1009,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1144,15 +1121,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
@@ -1160,16 +1137,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
@@ -1179,15 +1155,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1197,15 +1173,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1213,16 +1189,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1366,16 +1341,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1384,16 +1358,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1401,16 +1374,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1589,19 +1561,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1610,19 +1581,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1630,19 +1600,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1650,19 +1619,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: retq
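Every hunk above (and in the 512-bit file below) makes the same substitution: the shift amount is masked in place (e.g. `vpand %ymm3, %ymm2, %ymm2`, or an embedded-broadcast `vpandq {{.*}}(%rip){1to8}` where EVEX allows it) instead of into a fresh register, and the zero-amount test becomes `vptestnm` of the masked amount against itself rather than of the mask constant against the raw amount. The two predicates are equivalent, since `(amt & mask) == 0` holds exactly when the masked amount is zero, so the broadcast mask no longer needs a live register. A minimal LLVM IR sketch of that equivalence (illustrative only; the function name is made up):

define <8 x i1> @zero_amt_predicate(<8 x i64> %amt) {
  ; reduce the amount modulo the element width (w = 64, mask = w-1 = 63)
  %masked = and <8 x i64> %amt, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  ; (amt & 63) == 0 iff masked == 0, so once %masked exists the mask
  ; constant is dead and vptestnmq can take %masked as both operands,
  ; freeing one vector register in each of the hunks above
  %k = icmp eq <8 x i64> %masked, zeroinitializer
  ret <8 x i1> %k
}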
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index cb29f33b9e8..ca7a26a6b3e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -18,42 +18,39 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -65,14 +62,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -88,42 +84,39 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -135,14 +128,13 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -220,14 +212,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -239,14 +230,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -398,160 +388,156 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512BW-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLBW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
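The v64i8 hunks above also rely on the shift-by-bits expansion used when no variable byte shift exists: `vpsllw $5` moves the 3-bit amount toward the byte sign bit, `vpmovb2m` turns the sign bits into a mask, conditional shifts by 4, 2, then 1 are blended in, and `vpaddb %zmm, %zmm` exposes the next amount bit between steps. A scalar LLVM IR model of that decomposition (a sketch only, not the lowering itself; the name is illustrative):

define i8 @shl_by_amount_bits(i8 %v, i8 %amt) {
  ; bit 2 of the amount selects a shift by 4
  %b2 = and i8 %amt, 4
  %c2 = icmp ne i8 %b2, 0
  %s2 = shl i8 %v, 4
  %v2 = select i1 %c2, i8 %s2, i8 %v
  ; bit 1 selects a shift by 2
  %b1 = and i8 %amt, 2
  %c1 = icmp ne i8 %b1, 0
  %s1 = shl i8 %v2, 2
  %v1 = select i1 %c1, i8 %s1, i8 %v2
  ; bit 0 selects a shift by 1
  %b0 = and i8 %amt, 1
  %c0 = icmp ne i8 %b0, 0
  %s0 = shl i8 %v1, 1
  %v0 = select i1 %c0, i8 %s0, i8 %v1
  ret i8 %v0
}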
@@ -567,14 +553,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -582,14 +567,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
@@ -597,14 +581,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -618,14 +601,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -644,16 +626,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -661,16 +642,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
@@ -678,16 +658,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -701,16 +680,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -769,16 +747,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -792,16 +769,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -880,24 +856,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -905,24 +880,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
@@ -930,24 +904,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -955,24 +928,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
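
For context between the two file diffs: these tests lower the generic funnel-shift intrinsics (llvm.fshl.* in the vector-fshl-*.ll files above, llvm.fshr.* in the vector-fshr-*.ll files below). A minimal sketch of the IR behind a case like @var_funnnel_v2i64 in vector-fshr-128.ll, assuming the test calls llvm.fshr directly (the body here is illustrative, not quoted from the file):

declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)

define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) {
; Per lane: concatenate x:y into a 128-bit value, shift right by
; amt mod 64, and return the low 64 bits; when amt mod 64 == 0 the
; result is %y, which is why the checked asm selects %y under the
; mask produced by vptestnm.
; The updated CHECK lines in these hunks mask %amt once with vpand
; and reuse the masked value for the vptestnm zero-test, rather than
; keeping the splatted [63,63] mask constant live in its own register.
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}
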
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 1f70fc95277..7c31f9c296e 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -110,16 +110,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -127,29 +125,26 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -157,16 +152,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -174,14 +167,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -363,16 +355,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -380,29 +371,27 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -410,16 +399,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -427,14 +415,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -727,17 +714,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -745,17 +730,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -763,14 +746,13 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvw %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1056,21 +1038,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1078,21 +1058,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1100,38 +1078,36 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
@@ -1294,14 +1270,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -1310,14 +1285,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1325,14 +1299,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1342,14 +1315,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1358,14 +1330,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1516,15 +1487,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -1533,16 +1504,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1551,15 +1521,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1570,15 +1540,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1587,16 +1557,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1779,16 +1748,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1798,16 +1766,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1816,16 +1783,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -2053,18 +2019,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -2075,18 +2040,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2095,19 +2059,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -2115,19 +2078,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
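
For orientation before the next file's checks: a minimal sketch, in the same IR shape as these .ll tests, of the pattern the var_funnnel checks cover (the function name here is hypothetical; the llvm.fshr intrinsic and its signature are the ones the tests themselves declare). fshr concatenates %x (high) and %y (low) and shifts right by %amt modulo the element width, so a masked amount of zero must return %y — which is what the vptestnm + masked-move sequences above select.

; Hypothetical reduced form of the variable-amount funnel-shift-right tests.
; When the masked amount is zero, the result is %y (the low half), matching
; the vptestnm mask feeding the masked vmovdqa64/vmovdqu16 selects above.
define <4 x i64> @sketch_fshr_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) {
  %r = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
  ret <4 x i64> %r
}
declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
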
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 64283a6603b..1e55383a492 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -71,75 +71,70 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -258,75 +253,70 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -516,48 +506,43 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -796,79 +781,73 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
@@ -974,13 +953,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -988,14 +967,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1004,13 +982,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1020,13 +998,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1034,14 +1012,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1147,15 +1124,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -1163,16 +1140,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1181,15 +1157,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1199,15 +1175,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1215,16 +1191,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1368,16 +1343,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1386,16 +1360,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1403,16 +1376,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
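
The splatvar variants above differ from the var tests only in that the shift amount is a broadcast of element zero; a sketch of that IR shape, again under a hypothetical name:

; Hypothetical reduced form of the splatted-amount tests: the amount is
; splatted before the funnel shift, which is why the lowering above can use
; a single scalar shift count (vpsrlw/vpsllw with an xmm count register).
define <16 x i16> @sketch_splat_fshr_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) {
  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
  %r = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
  ret <16 x i16> %r
}
declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
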
@@ -1590,19 +1562,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
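
Every lowering in these hunks ends with a vptestnm of the (now pre-masked) shift amount against itself, followed by a masked move of %y. That follows directly from the intrinsic's semantics; as a reference, here is a minimal scalar sketch in C (the helper name fshr8 is ours, not from the tests):

#include <stdint.h>

/* Scalar reference for llvm.fshr per 8-bit lane (per the LangRef):
   concat(x, y) is shifted right by amt mod 8 and the low 8 bits kept.
   amt == 0 must return y exactly, which is why the vector lowerings
   test the masked amount for zero and blend %y back in. */
uint8_t fshr8(uint8_t x, uint8_t y, uint8_t amt) {
    amt &= 7;                    /* the vpand against the splatted 7s */
    if (amt == 0)
        return y;                /* the vptestnm + masked-move path */
    return (uint8_t)((y >> amt) | (x << (8 - amt)));
}
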
@@ -1611,19 +1582,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1631,38 +1601,36 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
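
The v32i8 paths above avoid per-byte shifts (which x86 lacks) by widening to 16-bit lanes with vpmovzxbw, shifting with vpsrlvw/vpsllvw, and narrowing back with vpmovwb. A sketch of what one byte lane computes (fshr8_widened is a hypothetical name):

#include <stdint.h>

/* In 16 bits, x << (8 - amt) is well defined even at amt == 0, where the
   final truncation already yields y; the vptestnm select is kept by the
   expansion regardless. */
uint8_t fshr8_widened(uint8_t x, uint8_t y, uint8_t amt) {
    amt &= 7;                                /* vpand with the 7s vector */
    uint16_t lo = (uint16_t)y >> amt;        /* vpsrlvw on zero-extended y */
    uint16_t hi = (uint16_t)x << (8 - amt);  /* vpsllvw by 8 - amt */
    uint8_t r = (uint8_t)(lo | hi);          /* vporq, then vpmovwb truncates */
    return (amt == 0) ? y : r;               /* vptestnmb + vmovdqu8 */
}
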
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 063fd038f22..dd469fba1e0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -18,40 +18,37 @@ declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
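
The change visible in these hunks: the amount is now masked in place (vpandq with a broadcast of 63 folded straight from memory), so one register feeds both the shifts and the zero test, and nothing has to hold the 63s just for vptestnmq. One 64-bit lane, as a hedged C sketch (fshr64_lane is our name):

#include <stdint.h>

uint64_t fshr64_lane(uint64_t x, uint64_t y, uint64_t amt) {
    amt &= 63;                                  /* vpandq {{.*}}(%rip){1to8} */
    uint64_t hi = amt ? x << (64 - amt) : 0;    /* vpsllvq: a count of 64 gives 0 */
    uint64_t r = (y >> amt) | hi;               /* vpsrlvq + vporq */
    return amt ? r : y;                         /* vptestnmq %zmm2, %zmm2 + move */
}
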
@@ -63,14 +60,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -86,40 +82,37 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -131,14 +124,13 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -216,14 +208,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -235,14 +226,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -394,28 +384,27 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -423,38 +412,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -462,38 +450,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -501,38 +488,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -540,11 +526,11 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
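
The v64i8 path above shifts each byte by a per-byte amount with no byte-granular shift instruction: vpsllw $5 moves amount bit 2 into each byte's sign bit, vpmovb2m turns the sign bits into a mask, and a blend applies a shift of 4; doubling the amount (vpaddb) exposes the next bit for the shifts of 2 and 1. Scalar equivalent (the name var_shr8 is ours):

#include <stdint.h>

uint8_t var_shr8(uint8_t v, uint8_t amt) {   /* amt already masked to 0..7 */
    if (amt & 4) v >>= 4;   /* mask from bit 2 in the sign position */
    if (amt & 2) v >>= 2;   /* after one vpaddb doubling */
    if (amt & 1) v >>= 1;   /* after a second doubling */
    return v;
}
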
@@ -559,42 +545,39 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -608,14 +591,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -634,48 +616,45 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -689,16 +668,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -757,16 +735,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -780,16 +757,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -868,96 +844,92 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
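
The splatvar v64i8 path shifts 16-bit words by the single scalar count, then clears the bits that crossed a byte boundary; the mask comes from shifting an all-ones register (built with vpternlogd $255) by the same count and broadcasting its low byte. A sketch for one word, under those assumptions (splat_shr_bytes is our name):

#include <stdint.h>

uint16_t splat_shr_bytes(uint16_t w, unsigned amt) {  /* amt in [0,7] */
    uint16_t shifted = (uint16_t)(w >> amt);          /* vpsrlw: bits cross bytes */
    uint8_t m = (uint8_t)(0xFFFFu >> amt >> 8);       /* vpsrlw on all-ones, vpsrlw $8 */
    uint16_t mask = (uint16_t)(m | (m << 8));         /* vpbroadcastb */
    return (uint16_t)(shifted & mask);                /* vpandq */
}
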
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index f7097133268..d5b7b7152aa 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -442,16 +441,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
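
For context on the AVX512BW lzcnt sequences above and below: a minimal C sketch,
under assumed illustrative names, of the nibble-LUT technique being tested — a
per-byte leading-zero count built from two vpshufb table lookups. The 16-bit
combination step is sketched after the v64i8 tests below.

    #include <stdint.h>

    /* lzcnt of a 4-bit value; this is the splatted vpshufb table. */
    static const uint8_t lz4[16] =
        {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};

    /* Per-byte leading zeros: look up both nibbles (the two vpshufb
       ops). When the high nibble is zero (the vptestnmb + vpmovm2b
       mask), lz4[hi] already contributes 4, so adding the low-nibble
       count yields 4 + lz4[lo]; otherwise lz4[hi] alone is the answer. */
    static uint8_t lzcnt8(uint8_t x) {
        uint8_t hi_cnt = lz4[x >> 4];   /* vpsrlw $4 + vpandq with 15s */
        uint8_t lo_cnt = lz4[x & 15];   /* vpshufb on the low nibble   */
        return (x >> 4) ? hi_cnt : (uint8_t)(hi_cnt + lo_cnt);
    }
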
@@ -549,16 +547,15 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
@@ -640,16 +637,15 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
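
Finally, building on the lzcnt8 sketch above, the 16-bit combination that the
testv32i16 variants check (the vptestnmb of the original bytes plus the vpsrlw $8
select at the tail of those sequences). Again an assumed, illustrative sketch
rather than the compiler's actual lowering.

    /* 16-bit leading zeros from two per-byte counts: take the high
       byte's count, unless the high byte is all zero, in which case
       the answer is 8 plus the low byte's count. */
    static uint8_t lzcnt16(uint16_t x) {
        uint8_t hi = (uint8_t)(x >> 8);
        return hi ? lzcnt8(hi)
                  : (uint8_t)(8 + lzcnt8((uint8_t)x));
    }
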