summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@intel.com>2018-01-27 20:19:09 +0000
committerCraig Topper <craig.topper@intel.com>2018-01-27 20:19:09 +0000
commit247016a735bd3264e07c6198dd3a7c419e6eeaee (patch)
treee6699e05b9802670c7aa93f9f201ed534ae4dc02
parent513d3fa674c621d864383ffa6418878118d48791 (diff)
downloadbcm5719-llvm-247016a735bd3264e07c6198dd3a7c419e6eeaee.tar.gz
bcm5719-llvm-247016a735bd3264e07c6198dd3a7c419e6eeaee.zip
[X86] Use vptestm/vptestnm for comparisons with zero to avoid creating a zero vector.
We can use the same input for both operands to get a free compare with zero. We already use this trick in a couple places where we explicitly create PTESTM with the same input twice. This generalizes it. I'm hoping to remove the ISD opcodes and move this to isel patterns like we do for scalar cmp/test. llvm-svn: 323605
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp7
-rw-r--r--llvm/test/CodeGen/X86/avx512-arith.ll84
-rw-r--r--llvm/test/CodeGen/X86/avx512-mask-op.ll15
-rw-r--r--llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll20
-rw-r--r--llvm/test/CodeGen/X86/avx512-mov.ll24
-rwxr-xr-xllvm/test/CodeGen/X86/avx512-schedule.ll228
-rwxr-xr-xllvm/test/CodeGen/X86/avx512-shuffle-schedule.ll4128
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll552
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll288
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll432
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/permute.ll336
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll192
-rw-r--r--llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll576
-rw-r--r--llvm/test/CodeGen/X86/avx512-vbroadcast.ll24
-rw-r--r--llvm/test/CodeGen/X86/avx512-vec-cmp.ll27
-rw-r--r--llvm/test/CodeGen/X86/avx512bw-mov.ll12
-rw-r--r--llvm/test/CodeGen/X86/avx512bwvl-mov.ll24
-rwxr-xr-xllvm/test/CodeGen/X86/avx512vl-arith.ll138
-rw-r--r--llvm/test/CodeGen/X86/avx512vl-mov.ll84
-rw-r--r--llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll21
-rw-r--r--llvm/test/CodeGen/X86/compress_expand.ll35
-rw-r--r--llvm/test/CodeGen/X86/masked_gather_scatter.ll44
-rw-r--r--llvm/test/CodeGen/X86/masked_memop.ll72
-rw-r--r--llvm/test/CodeGen/X86/nontemporal-loads.ll3
-rw-r--r--llvm/test/CodeGen/X86/pr35272.ll3
-rw-r--r--llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll92
-rw-r--r--llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll47
-rw-r--r--llvm/test/CodeGen/X86/vector-lzcnt-512.ll6
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll3
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-v1.ll36
30 files changed, 2560 insertions, 4993 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 912ae31aaf3..776632551d5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17777,6 +17777,13 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM,
dl, VT, RHS, LHS);
}
+
+ // If this is just a comparison with 0 without an AND, we can just use
+ // the same input twice to avoid creating a zero vector.
+ if (ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM,
+ dl, VT, Op0, Op0);
+ }
}
unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 766238f3280..7c8a18ad782 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -401,8 +401,7 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -414,8 +413,7 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma
define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -427,8 +425,7 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m
define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_fold_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -441,8 +438,7 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_broadcast_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -454,8 +450,7 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_fold_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -468,8 +463,7 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_broadcast_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -671,8 +665,7 @@ entry:
define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vaddps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -686,8 +679,7 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vmulps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -701,8 +693,7 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vminps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -718,38 +709,33 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vminpd:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mask_vminpd:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT: vptestmd %ymm3, %ymm3, %k1
; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vminpd:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_mask_vminpd:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_mask_vminpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1
; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
<8 x double> %j, <8 x i32> %mask1)
@@ -764,8 +750,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vmaxps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -781,38 +766,33 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vmaxpd:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mask_vmaxpd:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT: vptestmd %ymm3, %ymm3, %k1
; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vmaxpd:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_mask_vmaxpd:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_mask_vmaxpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1
; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
<8 x double> %j, <8 x i32> %mask1)
@@ -827,8 +807,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vsubps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -842,8 +821,7 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vdivps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<16 x float> %j, <16 x i32> %mask1)
@@ -857,8 +835,7 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpneqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestmq %zmm3, %zmm3, %k1
; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<8 x double> %j, <8 x i64> %mask1)
@@ -872,8 +849,7 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
; CHECK-LABEL: test_maskz_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
<8 x i64> %mask1) nounwind readnone {
@@ -886,8 +862,7 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_fold_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
<8 x double>* %j, <8 x i64> %mask1)
@@ -902,8 +877,7 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
; CHECK-LABEL: test_maskz_fold_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
<8 x i64> %mask1) nounwind {
@@ -930,8 +904,7 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind
define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_broadcast_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1
+; CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -949,8 +922,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
; CHECK-LABEL: test_maskz_broadcast_vaddpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
<8 x i64> %mask1) nounwind {
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index c5efcb4f358..f4425b7583d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -2413,8 +2413,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; KNL-LABEL: test_bitcast_v8i1_zext:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: addl %eax, %eax
@@ -2423,8 +2422,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
;
; SKX-LABEL: test_bitcast_v8i1_zext:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: addl %eax, %eax
; SKX-NEXT: vzeroupper
@@ -2432,8 +2430,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
;
; AVX512BW-LABEL: test_bitcast_v8i1_zext:
; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movzbl %al, %eax
; AVX512BW-NEXT: addl %eax, %eax
@@ -2442,8 +2439,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
;
; AVX512DQ-LABEL: test_bitcast_v8i1_zext:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, %eax
; AVX512DQ-NEXT: addl %eax, %eax
; AVX512DQ-NEXT: vzeroupper
@@ -2459,8 +2455,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
; CHECK-LABEL: test_bitcast_v16i1_zext:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addl %eax, %eax
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index 083427c76da..275884c6de0 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -5,8 +5,7 @@
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test1:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -17,8 +16,7 @@ define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test2:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -29,8 +27,7 @@ define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
; AVX512-LABEL: test3:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -42,8 +39,7 @@ define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
; AVX512-LABEL: test4:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -54,8 +50,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
; AVX512-LABEL: test13:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -99,9 +94,8 @@ declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
; AVX512-LABEL: test23:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512-NEXT: vpcmpeqq %zmm2, %zmm1, %k2
+; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
+; AVX512-NEXT: vptestnmq %zmm1, %zmm1, %k2
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll
index f1a2ac880ed..f7ab9e24d72 100644
--- a/llvm/test/CodeGen/X86/avx512-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512-mov.ll
@@ -311,8 +311,7 @@ define <16 x float> @test31(i8 * %addr) {
define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK-LABEL: test32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -325,8 +324,7 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK-LABEL: test33:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -339,8 +337,7 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
; CHECK-LABEL: test34:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -353,8 +350,7 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
; CHECK-LABEL: test35:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -367,8 +363,7 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK-LABEL: test36:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -381,8 +376,7 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK-LABEL: test37:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -395,8 +389,7 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
; CHECK-LABEL: test38:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -409,8 +402,7 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
; CHECK-LABEL: test39:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll
index a7969251940..1f9fa5204d1 100755
--- a/llvm/test/CodeGen/X86/avx512-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-schedule.ll
@@ -400,15 +400,13 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_mask_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_mask_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -420,15 +418,13 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma
define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_maskz_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_maskz_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -440,15 +436,13 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m
define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_mask_fold_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_mask_fold_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -461,15 +455,13 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_mask_broadcast_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_mask_broadcast_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -481,15 +473,13 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_maskz_fold_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_maskz_fold_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -502,15 +492,13 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: vpaddd_maskz_broadcast_test:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vpaddd_maskz_broadcast_test:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -717,15 +705,13 @@ entry:
define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
; GENERIC-LABEL: test_mask_vaddps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vaddps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
<16 x float> %j, <16 x i32> %mask1)
@@ -739,15 +725,13 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vmulps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vmulps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -759,15 +743,13 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x
define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vminps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vminps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -780,15 +762,13 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x
define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vminpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vminpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -801,15 +781,13 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x d
define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vmaxps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vmaxps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -822,15 +800,13 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x
define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vmaxpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vmaxpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -843,15 +819,13 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x d
define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vsubps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vsubps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -863,15 +837,13 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x
define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vdivps:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vdivps:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -883,15 +855,13 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x
define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
; GENERIC-LABEL: test_mask_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -903,15 +873,13 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x d
define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
; GENERIC-LABEL: test_maskz_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_maskz_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -923,15 +891,13 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i6
define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind {
; GENERIC-LABEL: test_mask_fold_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_fold_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -944,15 +910,13 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <
define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind {
; GENERIC-LABEL: test_maskz_fold_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_maskz_fold_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -983,16 +947,14 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind
define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind {
; GENERIC-LABEL: test_mask_broadcast_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_mask_broadcast_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1009,15 +971,13 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
; GENERIC-LABEL: test_maskz_broadcast_vaddpd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_maskz_broadcast_vaddpd:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
<8 x i64> %mask1) nounwind {
@@ -6383,15 +6343,13 @@ define <16 x float> @mov_test31(i8 * %addr) {
define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test32:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test32:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6404,15 +6362,13 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test33:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test33:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6425,15 +6381,13 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test34:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test34:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6446,15 +6400,13 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test35:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test35:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6467,15 +6419,13 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test36:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test36:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6488,15 +6438,13 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test37:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test37:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6509,15 +6457,13 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test38:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test38:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6530,15 +6476,13 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test39:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: mov_test39:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -8032,8 +7976,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; GENERIC-LABEL: test_bitcast_v8i1_zext:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33]
; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
@@ -8041,8 +7984,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
;
; SKX-LABEL: test_bitcast_v8i1_zext:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00]
; SKX-NEXT: addl %eax, %eax # sched: [1:0.25]
; SKX-NEXT: vzeroupper # sched: [4:1.00]
@@ -8058,8 +8000,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
; GENERIC-LABEL: test_bitcast_v16i1_zext:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33]
; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
@@ -8067,8 +8008,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
;
; SKX-LABEL: test_bitcast_v16i1_zext:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00]
; SKX-NEXT: addl %eax, %eax # sched: [1:0.25]
; SKX-NEXT: vzeroupper # sched: [4:1.00]
@@ -8292,16 +8232,14 @@ define <16 x float> @_inreg16xfloat(float %a) {
define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
; GENERIC-LABEL: _ss16xfloat_mask:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _ss16xfloat_mask:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8315,15 +8253,13 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
; GENERIC-LABEL: _ss16xfloat_maskz:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _ss16xfloat_maskz:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -8352,15 +8288,13 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
; GENERIC-LABEL: _ss16xfloat_mask_load:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _ss16xfloat_mask_load:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load float, float* %a.ptr
@@ -8374,15 +8308,13 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
; GENERIC-LABEL: _ss16xfloat_maskz_load:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _ss16xfloat_maskz_load:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load float, float* %a.ptr
@@ -8411,16 +8343,14 @@ define <8 x double> @_inreg8xdouble(double %a) {
define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
; GENERIC-LABEL: _sd8xdouble_mask:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _sd8xdouble_mask:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8434,15 +8364,13 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; GENERIC-LABEL: _sd8xdouble_maskz:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _sd8xdouble_maskz:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -8471,15 +8399,13 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
; GENERIC-LABEL: _sd8xdouble_mask_load:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _sd8xdouble_mask_load:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load double, double* %a.ptr
@@ -8493,15 +8419,13 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
; GENERIC-LABEL: _sd8xdouble_maskz_load:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: _sd8xdouble_maskz_load:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load double, double* %a.ptr
diff --git a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll
index 6e4bd992718..544d7e2f1a8 100755
--- a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -23,8 +23,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -32,8 +31,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -47,16 +45,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -68,8 +64,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -77,8 +72,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -92,16 +86,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -113,8 +105,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -122,8 +113,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -137,16 +127,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -173,8 +161,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -182,8 +169,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -197,16 +183,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -234,16 +218,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -257,16 +239,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -280,16 +260,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -303,16 +281,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -326,16 +302,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -349,16 +323,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -388,16 +360,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -411,16 +381,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -449,8 +417,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve
; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -458,8 +425,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve
; SKX-LABEL: test_masked_32xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -473,16 +439,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
@@ -494,8 +458,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve
; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -503,8 +466,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve
; SKX-LABEL: test_masked_32xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -518,16 +480,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
@@ -539,8 +499,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve
; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -548,8 +507,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve
; SKX-LABEL: test_masked_32xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -563,16 +521,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
@@ -599,8 +555,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve
; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -608,8 +563,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve
; SKX-LABEL: test_masked_32xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -623,16 +577,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
@@ -660,16 +612,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16>
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -683,16 +633,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -706,16 +654,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16>
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -729,16 +675,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -752,16 +696,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16>
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -775,16 +717,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -814,16 +754,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16>
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -837,16 +775,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -875,8 +811,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -884,8 +819,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -899,16 +833,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
@@ -920,8 +852,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -929,8 +860,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -944,16 +874,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
@@ -965,8 +893,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -974,8 +901,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -989,16 +915,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
@@ -1025,8 +949,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1034,8 +957,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1049,16 +971,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
@@ -1086,16 +1006,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1109,16 +1027,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1132,16 +1048,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1155,16 +1069,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1178,16 +1090,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1201,16 +1111,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1240,16 +1148,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1263,16 +1169,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1301,8 +1205,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve
; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1310,8 +1213,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve
; SKX-LABEL: test_masked_16xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1325,16 +1227,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
@@ -1346,8 +1246,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve
; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1355,8 +1254,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve
; SKX-LABEL: test_masked_16xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1370,16 +1268,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
@@ -1391,8 +1287,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve
; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1400,8 +1295,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve
; SKX-LABEL: test_masked_16xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1415,16 +1309,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
@@ -1451,8 +1343,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve
; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1460,8 +1351,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve
; SKX-LABEL: test_masked_16xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1475,16 +1365,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
@@ -1512,16 +1400,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32>
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1535,16 +1421,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1558,16 +1442,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32>
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1581,16 +1463,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1604,16 +1484,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32>
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1627,16 +1505,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1666,16 +1542,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32>
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1689,16 +1563,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1724,16 +1596,14 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1746,15 +1616,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -1765,16 +1633,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1787,15 +1653,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -1806,16 +1670,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1828,15 +1690,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
@@ -1860,16 +1720,14 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1882,15 +1740,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
@@ -1915,15 +1771,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1936,15 +1790,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1957,15 +1809,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1978,15 +1828,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1999,15 +1847,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2020,15 +1866,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2055,15 +1899,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2076,15 +1918,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2113,8 +1953,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2,
; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2122,8 +1961,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2,
; SKX-LABEL: test_masked_8xi64_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2137,16 +1975,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
@@ -2157,16 +1993,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2179,15 +2013,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
@@ -2199,8 +2031,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2,
; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2208,8 +2039,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2,
; SKX-LABEL: test_masked_8xi64_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2223,16 +2053,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
@@ -2256,16 +2084,14 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2278,15 +2104,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
@@ -2298,8 +2122,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2,
; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2307,8 +2130,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2,
; SKX-LABEL: test_masked_8xi64_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2322,16 +2144,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
@@ -2342,16 +2162,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2364,15 +2182,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -2399,8 +2215,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2,
; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2408,8 +2223,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2,
; SKX-LABEL: test_masked_8xi64_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2423,16 +2237,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
@@ -2443,16 +2255,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2465,15 +2275,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -2501,16 +2309,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2524,16 +2330,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2546,15 +2350,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2567,15 +2369,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2589,16 +2389,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2612,16 +2410,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2648,15 +2444,13 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2669,15 +2463,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2691,16 +2483,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2714,16 +2504,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2736,15 +2524,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2757,15 +2543,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2795,16 +2579,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2818,16 +2600,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2840,15 +2620,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2861,15 +2639,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2898,8 +2674,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2907,8 +2682,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2922,16 +2696,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
@@ -2943,8 +2715,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2952,8 +2723,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2967,16 +2737,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
@@ -2988,8 +2756,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2997,8 +2764,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3012,16 +2778,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
@@ -3048,8 +2812,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3057,8 +2820,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3072,16 +2834,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
@@ -3109,16 +2869,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3132,16 +2890,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3155,16 +2911,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3178,16 +2932,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3201,16 +2953,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3224,16 +2974,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3263,16 +3011,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3286,16 +3032,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3324,8 +3068,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl
; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3333,8 +3076,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl
; SKX-LABEL: test_masked_16xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3348,16 +3090,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
@@ -3369,8 +3109,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl
; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3378,8 +3117,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl
; SKX-LABEL: test_masked_16xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3393,16 +3131,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
@@ -3414,8 +3150,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl
; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3423,8 +3158,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl
; SKX-LABEL: test_masked_16xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3438,16 +3172,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
@@ -3474,8 +3206,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl
; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3483,8 +3214,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl
; SKX-LABEL: test_masked_16xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3498,16 +3228,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
@@ -3535,16 +3263,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3558,16 +3284,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3581,16 +3305,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3604,16 +3326,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3627,16 +3347,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3650,16 +3368,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3689,16 +3405,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3712,16 +3426,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <1
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3747,16 +3459,14 @@ define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3769,15 +3479,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
@@ -3788,16 +3496,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i
define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3810,15 +3516,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
@@ -3829,16 +3533,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i
define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3851,15 +3553,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
@@ -3883,16 +3583,14 @@ define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3905,15 +3603,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
@@ -3938,15 +3634,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -3959,15 +3653,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -3980,15 +3672,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4
define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4001,15 +3691,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4022,15 +3710,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4
define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4043,15 +3729,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4078,15 +3762,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4099,15 +3781,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4136,8 +3816,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou
; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4145,8 +3824,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou
; SKX-LABEL: test_masked_8xdouble_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4160,16 +3838,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
@@ -4180,16 +3856,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4202,15 +3876,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
@@ -4222,8 +3894,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou
; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4231,8 +3902,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou
; SKX-LABEL: test_masked_8xdouble_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4246,16 +3916,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
@@ -4279,16 +3947,14 @@ define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4301,15 +3967,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
@@ -4321,8 +3985,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou
; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4330,8 +3993,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou
; SKX-LABEL: test_masked_8xdouble_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4345,16 +4007,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
@@ -4365,16 +4025,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4387,15 +4045,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
@@ -4422,8 +4078,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou
; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4431,8 +4086,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou
; SKX-LABEL: test_masked_8xdouble_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4446,16 +4100,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
@@ -4466,16 +4118,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4488,15 +4138,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
@@ -4524,16 +4172,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4547,16 +4193,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4569,15 +4213,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4590,15 +4232,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4612,16 +4252,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4635,16 +4273,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4671,15 +4307,13 @@ define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4692,15 +4326,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4714,16 +4346,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4737,16 +4367,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4759,15 +4387,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4780,15 +4406,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4818,16 +4442,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4841,16 +4463,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4863,15 +4483,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4884,15 +4502,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4918,16 +4534,14 @@ define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4940,15 +4554,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -4959,16 +4571,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4981,15 +4591,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -5000,16 +4608,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5022,15 +4628,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -5054,16 +4658,14 @@ define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5076,15 +4678,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -5112,16 +4712,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5135,16 +4733,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5158,16 +4754,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5181,16 +4775,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5204,16 +4796,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5227,16 +4817,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5266,16 +4854,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5289,16 +4875,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5324,16 +4908,14 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5346,15 +4928,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
@@ -5365,16 +4945,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5387,15 +4965,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
@@ -5406,16 +4982,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5428,15 +5002,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
@@ -5460,16 +5032,14 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5482,15 +5052,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
@@ -5518,16 +5086,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5541,16 +5107,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5564,16 +5128,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5587,16 +5149,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5610,16 +5170,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5633,16 +5191,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5672,16 +5228,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5695,16 +5249,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5730,16 +5282,14 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5752,15 +5302,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
@@ -5771,16 +5319,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5793,15 +5339,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
@@ -5812,16 +5356,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5834,15 +5376,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
@@ -5866,16 +5406,14 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5888,15 +5426,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
@@ -5924,16 +5460,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5947,16 +5481,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5970,16 +5502,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5993,16 +5523,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6016,16 +5544,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6039,16 +5565,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6078,16 +5602,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6101,16 +5623,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6136,16 +5656,14 @@ define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6158,15 +5676,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
@@ -6177,16 +5693,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6199,15 +5713,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6218,16 +5730,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6240,15 +5750,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
@@ -6272,16 +5780,14 @@ define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6294,15 +5800,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6313,16 +5817,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6335,15 +5837,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
@@ -6354,16 +5854,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6376,15 +5874,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6408,16 +5904,14 @@ define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6430,15 +5924,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
@@ -6449,16 +5941,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6471,15 +5961,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6504,15 +5992,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6525,15 +6011,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6546,15 +6030,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6567,15 +6049,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6588,15 +6068,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6609,15 +6087,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6644,15 +6120,13 @@ define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6665,15 +6139,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6686,15 +6158,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6707,15 +6177,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6728,15 +6196,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6749,15 +6215,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6784,15 +6248,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6805,15 +6267,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6826,15 +6286,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6847,15 +6305,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6881,16 +6337,14 @@ define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6903,15 +6357,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
@@ -6922,16 +6374,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6944,15 +6394,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -6963,16 +6411,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6985,15 +6431,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
@@ -7017,16 +6461,14 @@ define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7039,15 +6481,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7058,16 +6498,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7080,15 +6518,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
@@ -7099,16 +6535,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7121,15 +6555,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -7153,16 +6585,14 @@ define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7175,15 +6605,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
@@ -7194,16 +6622,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7216,15 +6642,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7249,15 +6673,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7270,15 +6692,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7291,15 +6711,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7312,15 +6730,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7333,15 +6749,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7354,15 +6768,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7389,15 +6801,13 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7410,15 +6820,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7431,15 +6839,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7452,15 +6858,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7473,15 +6877,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7494,15 +6896,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7529,15 +6929,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7550,15 +6948,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7571,15 +6967,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7592,15 +6986,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7626,16 +7018,14 @@ define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7648,15 +7038,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
@@ -7667,16 +7055,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7689,15 +7075,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7708,16 +7092,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7730,15 +7112,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
@@ -7762,16 +7142,14 @@ define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7784,15 +7162,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -7803,16 +7179,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7825,15 +7199,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
@@ -7844,16 +7216,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7866,15 +7236,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7898,16 +7266,14 @@ define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7920,15 +7286,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
@@ -7939,16 +7303,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7961,15 +7323,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7994,15 +7354,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8015,15 +7373,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8036,15 +7392,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8057,15 +7411,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8078,15 +7430,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8099,15 +7449,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8134,15 +7482,13 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8155,15 +7501,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8176,15 +7520,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8197,15 +7539,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8219,16 +7559,14 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8242,16 +7580,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8278,15 +7614,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8299,15 +7633,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8320,15 +7652,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8341,15 +7671,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8375,16 +7703,14 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8397,15 +7723,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
@@ -8416,16 +7740,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8438,15 +7760,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
@@ -8457,16 +7777,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8479,15 +7797,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
@@ -8511,16 +7827,14 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8533,15 +7847,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -8566,15 +7878,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8587,15 +7897,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8608,15 +7916,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8629,15 +7935,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8650,15 +7954,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8671,15 +7973,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8706,15 +8006,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8727,15 +8025,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8761,16 +8057,14 @@ define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8783,15 +8077,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
@@ -8802,16 +8094,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mas
define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8824,15 +8114,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
@@ -8843,16 +8131,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mas
define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8865,15 +8151,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
@@ -8897,16 +8181,14 @@ define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8919,15 +8201,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
@@ -8952,15 +8232,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -8973,15 +8251,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -8994,15 +8270,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32>
define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9015,15 +8289,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9036,15 +8308,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32>
define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9057,15 +8327,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9092,15 +8360,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9113,15 +8379,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9147,16 +8411,14 @@ define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9169,15 +8431,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
@@ -9188,16 +8448,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9210,15 +8468,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
@@ -9229,16 +8485,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9251,15 +8505,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
@@ -9283,16 +8535,14 @@ define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9305,15 +8555,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
@@ -9338,15 +8586,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9359,15 +8605,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9380,15 +8624,13 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i
define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9401,15 +8643,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9422,15 +8662,13 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i
define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9443,15 +8681,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9478,15 +8714,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9499,15 +8733,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9533,16 +8765,14 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9555,15 +8785,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9574,16 +8802,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9596,15 +8822,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9615,16 +8839,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9637,15 +8859,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9669,16 +8889,14 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9691,15 +8909,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9724,16 +8940,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9747,15 +8961,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9768,16 +8980,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9791,15 +9001,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9812,16 +9020,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9835,15 +9041,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9870,16 +9074,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9893,15 +9095,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9927,16 +9127,14 @@ define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float>
define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9949,15 +9147,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -9968,16 +9164,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9990,15 +9184,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
@@ -10009,16 +9201,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10031,15 +9221,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -10063,16 +9251,14 @@ define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float>
define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10085,15 +9271,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -10118,16 +9302,14 @@ define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x flo
define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10141,15 +9323,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10162,16 +9342,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec
define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10185,15 +9363,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10206,16 +9382,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec
define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10229,15 +9403,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10264,16 +9436,14 @@ define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x flo
define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10287,15 +9457,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10321,16 +9489,14 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10343,15 +9509,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10362,16 +9526,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10384,15 +9546,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10403,16 +9563,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10425,15 +9583,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10457,16 +9613,14 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10479,15 +9633,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10512,16 +9664,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10535,15 +9685,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10556,16 +9704,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10579,15 +9725,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10600,16 +9744,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10623,15 +9765,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10658,16 +9798,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10681,15 +9819,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10715,16 +9851,14 @@ define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double>
define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10737,15 +9871,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
@@ -10756,16 +9888,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10778,15 +9908,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -10797,16 +9925,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10819,15 +9945,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
@@ -10851,16 +9975,14 @@ define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double>
define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10873,15 +9995,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
@@ -10906,16 +10026,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x doub
define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10929,15 +10047,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10950,16 +10066,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec
define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10973,15 +10087,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10994,16 +10106,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec
define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11017,15 +10127,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11052,16 +10160,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x doub
define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11075,15 +10181,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11109,16 +10213,14 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11131,15 +10233,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11150,16 +10250,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11172,15 +10270,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11191,16 +10287,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11213,15 +10307,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11245,16 +10337,14 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11267,15 +10357,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11300,16 +10388,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11323,15 +10409,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11344,16 +10428,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11367,15 +10449,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11388,16 +10468,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11411,15 +10489,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11446,16 +10522,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11469,15 +10543,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11503,16 +10575,14 @@ define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11525,15 +10595,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -11544,16 +10612,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11566,15 +10632,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11585,16 +10649,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11607,15 +10669,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -11639,16 +10699,14 @@ define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11661,15 +10719,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11694,16 +10750,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11717,15 +10771,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11738,16 +10790,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11761,15 +10811,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11782,16 +10830,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11805,15 +10851,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11840,16 +10884,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11863,15 +10905,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11897,16 +10937,14 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11919,15 +10957,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -11938,16 +10974,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11960,15 +10994,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11979,16 +11011,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12001,15 +11031,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12033,16 +11061,14 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12055,15 +11081,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12088,16 +11112,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12111,15 +11133,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12132,16 +11152,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12155,15 +11173,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12176,16 +11192,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12199,15 +11213,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12234,16 +11246,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12257,15 +11267,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12291,16 +11299,14 @@ define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12313,15 +11319,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -12332,16 +11336,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12354,15 +11356,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -12373,16 +11373,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12395,15 +11393,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -12427,16 +11423,14 @@ define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12449,15 +11443,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -12482,16 +11474,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12505,15 +11495,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12526,16 +11514,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12549,15 +11535,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12570,16 +11554,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12593,15 +11575,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12628,16 +11608,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12651,15 +11629,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12685,16 +11661,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float>
define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12707,15 +11681,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12726,16 +11698,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12748,15 +11718,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12767,16 +11735,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12789,15 +11755,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12821,16 +11785,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float>
define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12843,15 +11805,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12876,16 +11836,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x fl
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12899,15 +11857,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12920,16 +11876,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %v
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12943,15 +11897,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12964,16 +11916,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %v
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12987,15 +11937,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13022,16 +11970,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x fl
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13045,15 +11991,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13079,16 +12023,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float>
define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13101,15 +12043,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13120,16 +12060,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13142,15 +12080,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13161,16 +12097,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13183,15 +12117,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13215,16 +12147,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float>
define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13237,15 +12167,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13270,16 +12198,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x fl
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13293,15 +12219,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13314,16 +12238,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %v
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13337,15 +12259,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13358,16 +12278,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %v
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13381,15 +12299,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13416,16 +12332,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x fl
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13439,15 +12353,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13473,16 +12385,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x fl
define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13495,15 +12405,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13514,16 +12422,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %ve
define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13536,15 +12442,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13555,16 +12459,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %ve
define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13577,15 +12479,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13609,16 +12509,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x fl
define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13631,15 +12529,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13664,16 +12560,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13687,15 +12581,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13708,16 +12600,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float>
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13731,15 +12621,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13752,16 +12640,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float>
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13775,15 +12661,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13810,16 +12694,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13833,15 +12715,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13867,16 +12747,14 @@ define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x dou
define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13889,15 +12767,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13908,16 +12784,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %ve
define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13930,15 +12804,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13963,16 +12835,14 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x
define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13986,15 +12856,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14007,16 +12875,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double>
define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14030,15 +12896,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14064,16 +12928,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x dou
define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14086,15 +12948,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14105,16 +12965,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %ve
define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14127,15 +12985,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14146,16 +13002,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %ve
define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14168,15 +13022,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14200,16 +13052,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x dou
define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14222,15 +13072,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14255,16 +13103,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14278,15 +13124,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14299,16 +13143,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double>
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14322,15 +13164,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14343,16 +13183,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double>
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14366,15 +13204,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14401,16 +13237,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14424,15 +13258,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14458,16 +13290,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x dou
define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14480,15 +13310,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14499,16 +13327,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %ve
define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14521,15 +13347,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14540,16 +13364,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %ve
define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14562,15 +13384,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14594,16 +13414,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x dou
define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14616,15 +13434,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14649,16 +13465,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14672,15 +13486,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14693,16 +13505,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double>
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14716,15 +13526,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14737,16 +13545,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double>
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14760,15 +13566,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14795,16 +13599,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14818,15 +13620,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14852,16 +13652,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float
define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14874,15 +13672,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14893,16 +13689,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1
define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14915,15 +13709,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14934,16 +13726,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1
define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14956,15 +13746,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14988,16 +13776,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float
define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15010,15 +13796,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15043,16 +13827,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x f
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15066,15 +13848,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15087,16 +13867,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15110,15 +13888,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15131,16 +13907,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15154,15 +13928,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15189,16 +13961,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x f
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15212,15 +13982,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15246,16 +14014,14 @@ define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float
define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15268,15 +14034,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15287,16 +14051,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1
define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15309,15 +14071,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15328,16 +14088,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1
define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15350,15 +14108,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15382,16 +14138,14 @@ define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float
define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15404,15 +14158,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15437,16 +14189,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x f
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15460,15 +14210,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15481,16 +14229,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15504,15 +14250,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15525,16 +14269,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15548,15 +14290,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15583,16 +14323,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x f
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15606,15 +14344,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15640,16 +14376,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x f
define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15662,15 +14396,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15681,16 +14413,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %v
define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15703,15 +14433,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15722,16 +14450,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %v
define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15744,15 +14470,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15776,16 +14500,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x f
define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15798,15 +14520,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15831,16 +14551,14 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15854,15 +14572,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15875,16 +14591,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15898,15 +14612,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15919,16 +14631,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15942,15 +14652,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15977,16 +14685,14 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16000,15 +14706,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -16034,16 +14738,14 @@ define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x do
define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16056,15 +14758,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1,
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16075,16 +14775,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %v
define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16097,15 +14795,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1,
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16130,16 +14826,14 @@ define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2
define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16153,15 +14847,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16174,16 +14866,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16197,15 +14887,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16231,16 +14919,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x do
define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16253,15 +14939,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16272,16 +14956,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %v
define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16294,15 +14976,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16313,16 +14993,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %v
define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16335,15 +15013,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16367,16 +15043,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x do
define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16389,15 +15063,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16422,16 +15094,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16445,15 +15115,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16466,16 +15134,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16489,15 +15155,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16510,16 +15174,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16533,15 +15195,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16568,16 +15228,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16591,15 +15249,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16625,16 +15281,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x do
define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16647,15 +15301,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16666,16 +15318,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %v
define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16688,15 +15338,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16707,16 +15355,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %v
define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16729,15 +15375,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16761,16 +15405,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x do
define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16783,15 +15425,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16816,16 +15456,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16839,15 +15477,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16860,16 +15496,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16883,15 +15517,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16904,16 +15536,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16927,15 +15557,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16962,16 +15590,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16985,15 +15611,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
index b31302d51ff..74005debfed 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
@@ -13,8 +13,7 @@ define <16 x i8> @test_i8_to_16(i8 %s) {
define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -27,8 +26,7 @@ define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8
define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -40,8 +38,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -54,8 +51,7 @@ define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8
define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -67,8 +63,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -81,8 +76,7 @@ define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8
define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -94,8 +88,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -108,8 +101,7 @@ define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8
define <16 x i8> @test_masked_z_i8_to_16_mask3(i8 %s, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -130,8 +122,7 @@ define <32 x i8> @test_i8_to_32(i8 %s) {
define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -144,8 +135,7 @@ define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8
define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -157,8 +147,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -171,8 +160,7 @@ define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8
define <32 x i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -184,8 +172,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -198,8 +185,7 @@ define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8
define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -211,8 +197,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -225,8 +210,7 @@ define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8
define <32 x i8> @test_masked_z_i8_to_32_mask3(i8 %s, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -247,8 +231,7 @@ define <64 x i8> @test_i8_to_64(i8 %s) {
define <64 x i8> @test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -261,8 +244,7 @@ define <64 x i8> @test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8
define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -274,8 +256,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -288,8 +269,7 @@ define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8
define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -301,8 +281,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -315,8 +294,7 @@ define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8
define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -328,8 +306,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -342,8 +319,7 @@ define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8
define <64 x i8> @test_masked_z_i8_to_64_mask3(i8 %s, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -364,8 +340,7 @@ define <8 x i16> @test_i16_to_8(i16 %s) {
define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -378,8 +353,7 @@ define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i1
define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -391,8 +365,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -405,8 +378,7 @@ define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i1
define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -418,8 +390,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -432,8 +403,7 @@ define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i1
define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -445,8 +415,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -459,8 +428,7 @@ define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i1
define <8 x i16> @test_masked_z_i16_to_8_mask3(i16 %s, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -481,8 +449,7 @@ define <16 x i16> @test_i16_to_16(i16 %s) {
define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -495,8 +462,7 @@ define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16
define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -508,8 +474,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) {
define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -522,8 +487,7 @@ define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16
define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -535,8 +499,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) {
define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -549,8 +512,7 @@ define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16
define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -562,8 +524,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) {
define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -576,8 +537,7 @@ define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16
define <16 x i16> @test_masked_z_i16_to_16_mask3(i16 %s, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -598,8 +558,7 @@ define <32 x i16> @test_i16_to_32(i16 %s) {
define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -612,8 +571,7 @@ define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32
define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -625,8 +583,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) {
define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -639,8 +596,7 @@ define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32
define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -652,8 +608,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) {
define <32 x i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -666,8 +621,7 @@ define <32 x i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32
define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -679,8 +633,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) {
define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -693,8 +646,7 @@ define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32
define <32 x i16> @test_masked_z_i16_to_32_mask3(i16 %s, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -715,8 +667,7 @@ define <4 x i32> @test_i32_to_4(i32 %s) {
define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -729,8 +680,7 @@ define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i3
define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -742,8 +692,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -756,8 +705,7 @@ define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i3
define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -769,8 +717,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -783,8 +730,7 @@ define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i3
define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 %s, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -796,8 +742,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 %s, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -810,8 +755,7 @@ define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i3
define <4 x i32> @test_masked_z_i32_to_4_mask3(i32 %s, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -832,8 +776,7 @@ define <8 x i32> @test_i32_to_8(i32 %s) {
define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -846,8 +789,7 @@ define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i3
define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -859,8 +801,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -873,8 +814,7 @@ define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i3
define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -886,8 +826,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -900,8 +839,7 @@ define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i3
define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -913,8 +851,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -927,8 +864,7 @@ define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i3
define <8 x i32> @test_masked_z_i32_to_8_mask3(i32 %s, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -949,8 +885,7 @@ define <16 x i32> @test_i32_to_16(i32 %s) {
define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -963,8 +898,7 @@ define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16
define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -976,8 +910,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) {
define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -990,8 +923,7 @@ define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16
define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1003,8 +935,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) {
define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1017,8 +948,7 @@ define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16
define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1030,8 +960,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) {
define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1044,8 +973,7 @@ define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16
define <16 x i32> @test_masked_z_i32_to_16_mask3(i32 %s, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1066,8 +994,7 @@ define <2 x i64> @test_i64_to_2(i64 %s) {
define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_2_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1080,8 +1007,7 @@ define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i6
define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_2_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1093,8 +1019,7 @@ define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) {
define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_2_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1107,8 +1032,7 @@ define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i6
define <2 x i64> @test_masked_z_i64_to_2_mask1(i64 %s, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_2_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1129,8 +1053,7 @@ define <4 x i64> @test_i64_to_4(i64 %s) {
define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1143,8 +1066,7 @@ define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i6
define <4 x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1156,8 +1078,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1170,8 +1091,7 @@ define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i6
define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1183,8 +1103,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1197,8 +1116,7 @@ define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i6
define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1210,8 +1128,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1224,8 +1141,7 @@ define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i6
define <4 x i64> @test_masked_z_i64_to_4_mask3(i64 %s, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1246,8 +1162,7 @@ define <8 x i64> @test_i64_to_8(i64 %s) {
define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1260,8 +1175,7 @@ define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i6
define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1273,8 +1187,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1287,8 +1200,7 @@ define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x i64> %default, <8 x i6
define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1300,8 +1212,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1314,8 +1225,7 @@ define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i6
define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1327,8 +1237,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1341,8 +1250,7 @@ define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i6
define <8 x i64> @test_masked_z_i64_to_8_mask3(i64 %s, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1364,8 +1272,7 @@ define <16 x i8> @test_i8_to_16_mem(i8* %p) {
define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1379,8 +1286,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16
define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1393,8 +1299,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1408,8 +1313,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16
define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1422,8 +1326,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1437,8 +1340,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16
define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1451,8 +1353,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) {
define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1466,8 +1367,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16
define <16 x i8> @test_masked_z_i8_to_16_mem_mask3(i8* %p, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1490,8 +1390,7 @@ define <32 x i8> @test_i8_to_32_mem(i8* %p) {
define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1505,8 +1404,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32
define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1519,8 +1417,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1534,8 +1431,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, <32
define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1548,8 +1444,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1563,8 +1458,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32
define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1577,8 +1471,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) {
define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1592,8 +1485,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32
define <32 x i8> @test_masked_z_i8_to_32_mem_mask3(i8* %p, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1616,8 +1508,7 @@ define <64 x i8> @test_i8_to_64_mem(i8* %p) {
define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1631,8 +1522,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64
define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1645,8 +1535,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1660,8 +1549,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64
define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1674,8 +1562,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1689,8 +1576,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64
define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1703,8 +1589,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) {
define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_i8_to_64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1718,8 +1603,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64
define <64 x i8> @test_masked_z_i8_to_64_mem_mask3(i8* %p, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i8, i8* %p
@@ -1742,8 +1626,7 @@ define <8 x i16> @test_i16_to_8_mem(i16* %p) {
define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1757,8 +1640,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8
define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1771,8 +1653,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1786,8 +1667,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8
define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1800,8 +1680,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1815,8 +1694,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8
define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1829,8 +1707,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) {
define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1844,8 +1721,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8
define <8 x i16> @test_masked_z_i16_to_8_mem_mask3(i16* %p, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1868,8 +1744,7 @@ define <16 x i16> @test_i16_to_16_mem(i16* %p) {
define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1883,8 +1758,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default,
define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1897,8 +1771,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask)
define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1912,8 +1785,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default,
define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1926,8 +1798,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask)
define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1941,8 +1812,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default,
define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1955,8 +1825,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask)
define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1970,8 +1839,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default,
define <16 x i16> @test_masked_z_i16_to_16_mem_mask3(i16* %p, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -1994,8 +1862,7 @@ define <32 x i16> @test_i16_to_32_mem(i16* %p) {
define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2009,8 +1876,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default,
define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2023,8 +1889,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask)
define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2038,8 +1903,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default,
define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2052,8 +1916,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask)
define <32 x i16> @test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2067,8 +1930,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default,
define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2081,8 +1943,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask)
define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_i16_to_32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2096,8 +1957,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default,
define <32 x i16> @test_masked_z_i16_to_32_mem_mask3(i16* %p, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i16, i16* %p
@@ -2120,8 +1980,7 @@ define <4 x i32> @test_i32_to_4_mem(i32* %p) {
define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2135,8 +1994,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4
define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2149,8 +2007,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2164,8 +2021,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4
define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2178,8 +2034,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2193,8 +2048,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4
define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2207,8 +2061,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) {
define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_4_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2222,8 +2075,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4
define <4 x i32> @test_masked_z_i32_to_4_mem_mask3(i32* %p, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2246,8 +2098,7 @@ define <8 x i32> @test_i32_to_8_mem(i32* %p) {
define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2261,8 +2112,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8
define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2275,8 +2125,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2290,8 +2139,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8
define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2304,8 +2152,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2319,8 +2166,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8
define <8 x i32> @test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2333,8 +2179,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) {
define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2348,8 +2193,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8
define <8 x i32> @test_masked_z_i32_to_8_mem_mask3(i32* %p, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2372,8 +2216,7 @@ define <16 x i32> @test_i32_to_16_mem(i32* %p) {
define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2387,8 +2230,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default,
define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2401,8 +2243,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask)
define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2416,8 +2257,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x i32> %default,
define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2430,8 +2270,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask)
define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2445,8 +2284,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default,
define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2459,8 +2297,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask)
define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_i32_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2474,8 +2311,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default,
define <16 x i32> @test_masked_z_i32_to_16_mem_mask3(i32* %p, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i32, i32* %p
@@ -2498,8 +2334,7 @@ define <2 x i64> @test_i64_to_2_mem(i64* %p) {
define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_2_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2513,8 +2348,7 @@ define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2
define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2527,8 +2361,7 @@ define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) {
define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_2_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2542,8 +2375,7 @@ define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2
define <2 x i64> @test_masked_z_i64_to_2_mem_mask1(i64* %p, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2566,8 +2398,7 @@ define <4 x i64> @test_i64_to_4_mem(i64* %p) {
define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2581,8 +2412,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4
define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2595,8 +2425,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2610,8 +2439,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4
define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2624,8 +2452,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2639,8 +2466,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4
define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2653,8 +2479,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) {
define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_4_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2668,8 +2493,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4
define <4 x i64> @test_masked_z_i64_to_4_mem_mask3(i64* %p, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2692,8 +2516,7 @@ define <8 x i64> @test_i64_to_8_mem(i64* %p) {
define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2707,8 +2530,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8
define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2721,8 +2543,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2736,8 +2557,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8
define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2750,8 +2570,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2765,8 +2584,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8
define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2779,8 +2597,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) {
define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_i64_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%s = load i64, i64* %p
@@ -2794,8 +2611,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8
define <8 x i64> @test_masked_z_i64_to_8_mem_mask3(i64* %p, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%s = load i64, i64* %p
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
index b6b5a6bcdca..218aa3ffe07 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
@@ -14,8 +14,7 @@ define <4 x i32> @test_2xi32_to_4xi32(<4 x i32> %vec) {
define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -28,8 +27,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %de
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -40,8 +38,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %
define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -54,8 +51,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %de
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -66,8 +62,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %
define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -80,8 +75,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %de
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -92,8 +86,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %
define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -106,8 +99,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %de
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -126,8 +118,7 @@ define <8 x i32> @test_2xi32_to_8xi32(<8 x i32> %vec) {
define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -140,8 +131,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %de
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -152,8 +142,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %
define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -166,8 +155,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %de
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -178,8 +166,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %
define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -192,8 +179,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %de
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -204,8 +190,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %
define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -218,8 +203,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %de
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -238,8 +222,7 @@ define <16 x i32> @test_2xi32_to_16xi32(<16 x i32> %vec) {
define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -252,8 +235,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -264,8 +246,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i3
define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -278,8 +259,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -290,8 +270,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i3
define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -304,8 +283,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -316,8 +294,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i3
define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -330,8 +307,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -351,8 +327,7 @@ define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -365,8 +340,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32>
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -378,8 +352,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i3
define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -392,8 +365,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32>
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -405,8 +377,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i3
define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -419,8 +390,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32>
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -432,8 +402,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i3
define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -446,8 +415,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32>
define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -468,8 +436,7 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -482,8 +449,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -495,8 +461,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -509,8 +474,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -522,8 +486,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -536,8 +499,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -549,8 +511,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -563,8 +524,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -589,8 +549,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -605,8 +564,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -620,8 +578,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -636,8 +593,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -651,8 +607,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -667,8 +622,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -682,8 +636,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -698,8 +651,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -720,8 +672,7 @@ define <8 x i32> @test_4xi32_to_8xi32_mem(<4 x i32>* %vp) {
define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -734,8 +685,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -747,8 +697,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -761,8 +710,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -774,8 +722,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -788,8 +735,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -801,8 +747,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -815,8 +760,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -837,8 +781,7 @@ define <16 x i32> @test_4xi32_to_16xi32_mem(<4 x i32>* %vp) {
define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -851,8 +794,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -864,8 +806,7 @@ define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x
define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -878,8 +819,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -891,8 +831,7 @@ define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x
define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -905,8 +844,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -918,8 +856,7 @@ define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x
define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -932,8 +869,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -954,8 +890,7 @@ define <4 x i64> @test_2xi64_to_4xi64_mem(<2 x i64>* %vp) {
define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -968,8 +903,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64>
define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -981,8 +915,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i6
define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -995,8 +928,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64>
define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1008,8 +940,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i6
define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1022,8 +953,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64>
define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1035,8 +965,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i6
define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1049,8 +978,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64>
define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1071,8 +999,7 @@ define <8 x i64> @test_2xi64_to_8xi64_mem(<2 x i64>* %vp) {
define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1085,8 +1012,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1098,8 +1024,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1112,8 +1037,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1125,8 +1049,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1139,8 +1062,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1152,8 +1074,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1166,8 +1087,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %vp
@@ -1188,8 +1108,7 @@ define <16 x i32> @test_8xi32_to_16xi32_mem(<8 x i32>* %vp) {
define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1202,8 +1121,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1215,8 +1133,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x
define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1229,8 +1146,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1242,8 +1158,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x
define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1256,8 +1171,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1269,8 +1183,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x
define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1283,8 +1196,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i
define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1305,8 +1217,7 @@ define <8 x i64> @test_4xi64_to_8xi64_mem(<4 x i64>* %vp) {
define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1319,8 +1230,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1332,8 +1242,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1346,8 +1255,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1359,8 +1267,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1373,8 +1280,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1386,8 +1292,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1400,8 +1305,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index abed2c04275..decaec05c67 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -22,8 +22,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -40,8 +39,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -57,8 +55,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -75,8 +72,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -92,8 +88,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -110,8 +105,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -139,8 +133,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -157,8 +150,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -189,8 +181,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -209,8 +200,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp,
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -229,8 +219,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -249,8 +238,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp,
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -269,8 +257,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -289,8 +276,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp,
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -321,8 +307,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -340,8 +325,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp,
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -369,8 +353,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
@@ -384,8 +367,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -400,8 +382,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
@@ -415,8 +396,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -431,8 +411,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
@@ -446,8 +425,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -473,8 +451,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
@@ -488,8 +465,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -516,8 +492,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -533,8 +508,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -549,8 +523,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -566,8 +539,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -582,8 +554,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -599,8 +570,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -627,8 +597,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -644,8 +613,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -673,8 +641,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -690,8 +657,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -709,8 +675,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -726,8 +691,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -745,8 +709,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -762,8 +725,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -793,8 +755,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -810,8 +771,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -843,8 +803,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -862,8 +821,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -881,8 +839,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -900,8 +857,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -919,8 +875,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -938,8 +893,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -971,8 +925,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -990,8 +943,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1019,8 +971,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1036,8 +987,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1053,8 +1003,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1071,8 +1020,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1086,8 +1034,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1102,8 +1049,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1128,8 +1074,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -1145,8 +1090,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1173,8 +1117,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x
; CHECK-NEXT: vmovaps (%rdi), %ymm2
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1191,8 +1134,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1209,8 +1151,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1227,8 +1168,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1245,8 +1185,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1263,8 +1202,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1297,8 +1235,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1317,8 +1254,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1346,8 +1282,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
@@ -1361,8 +1296,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1377,8 +1311,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
@@ -1392,8 +1325,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1408,8 +1340,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
@@ -1423,8 +1354,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1450,8 +1380,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
@@ -1465,8 +1394,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1493,8 +1421,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1510,8 +1437,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1526,8 +1452,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1543,8 +1468,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1559,8 +1483,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1576,8 +1499,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1604,8 +1526,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1621,8 +1542,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1645,8 +1565,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1660,8 +1579,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp,
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1678,8 +1596,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1695,8 +1612,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp,
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1714,8 +1630,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1731,8 +1646,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp,
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1762,8 +1676,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1779,8 +1692,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp,
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1812,8 +1724,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1831,8 +1742,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1850,8 +1760,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1869,8 +1778,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1888,8 +1796,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1907,8 +1814,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp,
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1949,8 +1855,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4
; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
; CHECK-NEXT: vpextrd $2, %xmm2, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1973,8 +1878,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
; CHECK-NEXT: vpextrd $2, %xmm1, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1999,8 +1903,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64
; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -2015,8 +1918,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i
; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2030,8 +1932,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2046,8 +1947,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2073,8 +1973,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm3[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2090,8 +1989,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2108,8 +2006,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2126,8 +2023,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2151,8 +2047,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2166,8 +2061,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
@@ -2181,8 +2075,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
@@ -2196,8 +2089,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2212,8 +2104,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
@@ -2227,8 +2118,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2254,8 +2144,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
@@ -2269,8 +2158,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2285,8 +2173,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
@@ -2300,8 +2187,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2316,8 +2202,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
@@ -2331,8 +2216,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2358,8 +2242,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
@@ -2373,8 +2256,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2389,8 +2271,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
@@ -2404,8 +2285,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -2430,8 +2310,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -2447,8 +2326,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2462,8 +2340,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2478,8 +2355,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2500,8 +2376,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2514,8 +2389,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2532,8 +2406,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4]
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2549,8 +2422,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2568,8 +2440,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1]
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2585,8 +2456,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2616,8 +2486,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2]
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2633,8 +2502,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2652,8 +2520,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2669,8 +2536,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2688,8 +2554,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,7,1]
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2705,8 +2570,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2736,8 +2600,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2753,8 +2616,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2772,8 +2634,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5]
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2789,8 +2650,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2820,8 +2680,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm2[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2838,8 +2697,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2857,8 +2715,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2876,8 +2733,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
index ac619279aed..5be6ab87461 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll
@@ -14,8 +14,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
; CHECK-LABEL: test_masked_16xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -29,8 +28,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %
; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -42,8 +40,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
; CHECK-LABEL: test_masked_16xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -57,8 +54,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %
; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -70,8 +66,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
; CHECK-LABEL: test_masked_16xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -85,8 +80,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %
; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -107,8 +101,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
; CHECK-LABEL: test_masked_16xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -122,8 +115,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %
; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -145,8 +137,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16>
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -160,8 +151,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -175,8 +165,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16>
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -190,8 +179,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -205,8 +193,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16>
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -220,8 +207,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -245,8 +231,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16>
; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -260,8 +245,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1
; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -284,8 +268,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve
; CHECK-LABEL: test_masked_32xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -299,8 +282,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %
; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
@@ -312,8 +294,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve
; CHECK-LABEL: test_masked_32xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -327,8 +308,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %
; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
@@ -340,8 +320,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve
; CHECK-LABEL: test_masked_32xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -355,8 +334,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %
; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
@@ -377,8 +355,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve
; CHECK-LABEL: test_masked_32xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -392,8 +369,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %
; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
@@ -415,8 +391,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16>
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -430,8 +405,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -445,8 +419,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16>
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -460,8 +433,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -475,8 +447,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16>
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -490,8 +461,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -515,8 +485,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16>
; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -530,8 +499,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1
; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -554,8 +522,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
; CHECK-LABEL: test_masked_8xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -569,8 +536,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask
; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
@@ -582,8 +548,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
; CHECK-LABEL: test_masked_8xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -597,8 +562,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask
; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
@@ -610,8 +574,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
; CHECK-LABEL: test_masked_8xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -625,8 +588,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask
; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
@@ -647,8 +609,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
; CHECK-LABEL: test_masked_8xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -662,8 +623,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask
; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
@@ -685,8 +645,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -700,8 +659,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -715,8 +673,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -730,8 +687,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -745,8 +701,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -760,8 +715,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -785,8 +739,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -800,8 +753,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -824,8 +776,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve
; CHECK-LABEL: test_masked_16xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -839,8 +790,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %
; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
@@ -852,8 +802,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve
; CHECK-LABEL: test_masked_16xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -867,8 +816,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %
; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
@@ -880,8 +828,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve
; CHECK-LABEL: test_masked_16xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -895,8 +842,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %
; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
@@ -917,8 +863,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve
; CHECK-LABEL: test_masked_16xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -932,8 +877,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %
; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
@@ -955,8 +899,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32>
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -970,8 +913,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -985,8 +927,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32>
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1000,8 +941,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1015,8 +955,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32>
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1030,8 +969,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1055,8 +993,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32>
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1070,8 +1007,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1092,8 +1028,7 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1106,8 +1041,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -1118,8 +1052,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1132,8 +1065,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -1144,8 +1076,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1158,8 +1089,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
@@ -1178,8 +1108,7 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1192,8 +1121,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
@@ -1213,8 +1141,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1227,8 +1154,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1241,8 +1167,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1255,8 +1180,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1269,8 +1193,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1283,8 +1206,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1306,8 +1228,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1320,8 +1241,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1344,8 +1264,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2,
; CHECK-LABEL: test_masked_8xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1359,8 +1278,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask
; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
@@ -1371,8 +1289,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1385,8 +1302,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
@@ -1398,8 +1314,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2,
; CHECK-LABEL: test_masked_8xi64_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1413,8 +1328,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask
; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
@@ -1433,8 +1347,7 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1447,8 +1360,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
@@ -1460,8 +1372,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2,
; CHECK-LABEL: test_masked_8xi64_perm_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1475,8 +1386,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask
; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
@@ -1487,8 +1397,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1501,8 +1410,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -1523,8 +1431,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2,
; CHECK-LABEL: test_masked_8xi64_perm_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1538,8 +1445,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask
; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
@@ -1550,8 +1456,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1564,8 +1469,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -1587,8 +1491,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1602,8 +1505,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1616,8 +1518,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1630,8 +1531,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1645,8 +1545,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1660,8 +1559,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1683,8 +1581,7 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1697,8 +1594,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1712,8 +1608,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1727,8 +1622,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1741,8 +1635,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1755,8 +1648,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1780,8 +1672,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve
; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1795,8 +1686,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %
; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1809,8 +1699,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -1823,8 +1712,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
index 1896356dafa..d4f12747028 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -1030,8 +1030,7 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1044,8 +1043,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1056,8 +1054,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1070,8 +1067,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1082,8 +1078,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1096,8 +1091,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1116,8 +1110,7 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1130,8 +1123,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1151,8 +1143,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1166,8 +1157,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1180,8 +1170,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1195,8 +1184,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1209,8 +1197,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1224,8 +1211,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1247,8 +1233,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1262,8 +1247,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1284,8 +1268,7 @@ define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1298,8 +1281,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -1310,8 +1292,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1324,8 +1305,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1336,8 +1316,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1350,8 +1329,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -1370,8 +1348,7 @@ define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1384,8 +1361,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1405,8 +1381,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1420,8 +1395,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1434,8 +1408,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1449,8 +1422,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
; CHECK-NEXT: retq
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1463,8 +1435,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1478,8 +1449,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT: retq
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1501,8 +1471,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1516,8 +1485,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
; CHECK-NEXT: retq
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1538,8 +1506,7 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1552,8 +1519,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1564,8 +1530,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1578,8 +1543,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1590,8 +1554,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1604,8 +1567,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1624,8 +1586,7 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -1638,8 +1599,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1659,8 +1619,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1674,8 +1633,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1688,8 +1646,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1703,8 +1660,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1717,8 +1673,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1732,8 +1687,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1755,8 +1709,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1770,8 +1723,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1792,8 +1744,7 @@ define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1806,8 +1757,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -1818,8 +1768,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1832,8 +1781,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -1844,8 +1792,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1858,8 +1805,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -1878,8 +1824,7 @@ define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1892,8 +1837,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -1913,8 +1857,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1928,8 +1871,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1942,8 +1884,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1957,8 +1898,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
; CHECK-NEXT: retq
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1971,8 +1911,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1986,8 +1925,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -2009,8 +1947,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2024,8 +1961,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll
index df46487d9ab..67de50a83a6 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll
@@ -12,8 +12,7 @@ define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_16xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -26,8 +25,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -38,8 +36,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_16xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -52,8 +49,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -64,8 +60,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_16xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -78,8 +73,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -98,8 +92,7 @@ define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_16xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -112,8 +105,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -135,8 +127,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve
; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -150,8 +141,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %
; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -165,8 +155,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve
; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -180,8 +169,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %
; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -195,8 +183,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve
; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -210,8 +197,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %
; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -235,8 +221,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve
; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -250,8 +235,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %
; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -272,8 +256,7 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_32xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -286,8 +269,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
@@ -298,8 +280,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_32xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -312,8 +293,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
@@ -324,8 +304,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_32xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -338,8 +317,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
@@ -358,8 +336,7 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_32xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -372,8 +349,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
@@ -395,8 +371,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve
; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -410,8 +385,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %
; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -425,8 +399,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve
; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -440,8 +413,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %
; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -455,8 +427,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve
; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -470,8 +441,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %
; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -495,8 +465,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve
; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -510,8 +479,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %
; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
; CHECK-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -532,8 +500,7 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -546,8 +513,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
; CHECK-NEXT: retq
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
@@ -558,8 +524,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -572,8 +537,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
; CHECK-NEXT: retq
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
@@ -584,8 +548,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -598,8 +561,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
; CHECK-NEXT: retq
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
@@ -618,8 +580,7 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -632,8 +593,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
; CHECK-NEXT: retq
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
@@ -655,8 +615,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -670,8 +629,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -685,8 +643,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -700,8 +657,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -715,8 +671,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -730,8 +685,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -755,8 +709,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -770,8 +723,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
; CHECK-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -792,8 +744,7 @@ define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -806,8 +757,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
@@ -818,8 +768,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -832,8 +781,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -844,8 +792,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -858,8 +805,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
@@ -878,8 +824,7 @@ define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -892,8 +837,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -904,8 +848,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -918,8 +861,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
@@ -930,8 +872,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -944,8 +885,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -964,8 +904,7 @@ define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -978,8 +917,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
@@ -990,8 +928,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -1004,8 +941,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -1025,8 +961,7 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1039,8 +974,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1053,8 +987,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1067,8 +1000,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1081,8 +1013,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1095,8 +1026,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1118,8 +1048,7 @@ define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1132,8 +1061,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1146,8 +1074,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1160,8 +1087,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1174,8 +1100,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1188,8 +1113,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1211,8 +1135,7 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1225,8 +1148,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1239,8 +1161,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1253,8 +1174,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -1275,8 +1195,7 @@ define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1289,8 +1208,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
@@ -1301,8 +1219,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1315,8 +1232,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1327,8 +1243,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1341,8 +1256,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
@@ -1361,8 +1275,7 @@ define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1375,8 +1288,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1387,8 +1299,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1401,8 +1312,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
@@ -1413,8 +1323,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1427,8 +1336,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -1447,8 +1355,7 @@ define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1461,8 +1368,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
@@ -1473,8 +1379,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -1487,8 +1392,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1508,8 +1412,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1522,8 +1425,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1536,8 +1438,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1550,8 +1451,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1564,8 +1464,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1578,8 +1477,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1601,8 +1499,7 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1615,8 +1512,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1629,8 +1525,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1643,8 +1538,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1657,8 +1551,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1671,8 +1564,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1694,8 +1586,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1708,8 +1599,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1722,8 +1612,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1736,8 +1625,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -1758,8 +1646,7 @@ define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1772,8 +1659,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
@@ -1784,8 +1670,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1798,8 +1683,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1810,8 +1694,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1824,8 +1707,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
@@ -1844,8 +1726,7 @@ define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1858,8 +1739,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1870,8 +1750,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1884,8 +1763,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
@@ -1896,8 +1774,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1910,8 +1787,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1930,8 +1806,7 @@ define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1944,8 +1819,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
@@ -1956,8 +1830,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -1970,8 +1843,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1991,8 +1863,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2005,8 +1876,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2019,8 +1889,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2033,8 +1902,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2047,8 +1915,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2061,8 +1928,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2084,8 +1950,7 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2098,8 +1963,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2112,8 +1976,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2126,8 +1989,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2141,8 +2003,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2156,8 +2017,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2179,8 +2039,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2193,8 +2052,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2207,8 +2065,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2221,8 +2078,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -2243,8 +2099,7 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -2257,8 +2112,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
@@ -2269,8 +2123,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -2283,8 +2136,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
@@ -2295,8 +2147,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -2309,8 +2160,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
@@ -2329,8 +2179,7 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -2343,8 +2192,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -2364,8 +2212,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2378,8 +2225,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2392,8 +2238,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2406,8 +2251,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2420,8 +2264,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2434,8 +2277,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2457,8 +2299,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2471,8 +2312,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -2493,8 +2333,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2507,8 +2346,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
@@ -2519,8 +2357,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask
define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2533,8 +2370,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
@@ -2545,8 +2381,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask
define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2559,8 +2394,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
@@ -2579,8 +2413,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -2593,8 +2426,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
@@ -2614,8 +2446,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2628,8 +2459,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2642,8 +2472,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %
define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2656,8 +2485,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2670,8 +2498,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %
define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2684,8 +2511,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2707,8 +2533,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2721,8 +2546,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -2743,8 +2567,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2757,8 +2580,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
@@ -2769,8 +2591,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2783,8 +2604,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
@@ -2795,8 +2615,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2809,8 +2628,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
@@ -2829,8 +2647,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2843,8 +2660,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
@@ -2864,8 +2680,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2878,8 +2693,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2892,8 +2706,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3
define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2906,8 +2719,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2920,8 +2732,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3
define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2934,8 +2745,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2957,8 +2767,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2971,8 +2780,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 09e7e646ca4..9792c4990c6 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -44,8 +44,7 @@ define <16 x float> @_inreg16xfloat(float %a) {
define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask:
; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; ALL-NEXT: vptestmd %zmm2, %zmm2, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovaps %zmm1, %zmm0
; ALL-NEXT: retq
@@ -59,8 +58,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz:
; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -84,8 +82,7 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask_load:
; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
%a = load float, float* %a.ptr
@@ -99,8 +96,7 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz_load:
; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
%a = load float, float* %a.ptr
@@ -125,8 +121,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
; ALL-LABEL: _sd8xdouble_mask:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
-; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; ALL-NEXT: vptestmd %zmm2, %zmm2, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovapd %zmm1, %zmm0
; ALL-NEXT: retq
@@ -141,8 +136,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -167,8 +161,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
; ALL-LABEL: _sd8xdouble_mask_load:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
%a = load double, double* %a.ptr
@@ -183,8 +176,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1)
; ALL-LABEL: _sd8xdouble_maskz_load:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
%a = load double, double* %a.ptr
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 3d552da7330..b84d61b5a25 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -926,8 +926,7 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; KNL-LABEL: test47:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -938,8 +937,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: ## kill: def %xmm2 killed %xmm2 def %zmm2
; AVX512BW-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
-; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpblendmb %zmm1, %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
@@ -947,8 +945,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
;
; SKX-LABEL: test47:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1
; SKX-NEXT: vpblendmb %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -960,8 +957,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) {
; KNL-LABEL: test48:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -971,16 +967,14 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: ## kill: def %ymm2 killed %ymm2 def %zmm2
; AVX512BW-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
-; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; SKX-LABEL: test48:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1
; SKX-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k1}
; SKX-NEXT: retq
%cmp = icmp eq <16 x i32> %a, zeroinitializer
@@ -991,8 +985,7 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) {
define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) {
; KNL-LABEL: test49:
; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT: vpcmpeqq %zmm3, %zmm0, %k1
+; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -1003,8 +996,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) {
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: ## kill: def %xmm2 killed %xmm2 def %zmm2
; AVX512BW-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
-; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpcmpeqq %zmm3, %zmm0, %k1
+; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1012,8 +1004,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) {
;
; SKX-LABEL: test49:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm0, %k1
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1
; SKX-NEXT: vpblendmw %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512bw-mov.ll b/llvm/test/CodeGen/X86/avx512bw-mov.ll
index 7158fb262c0..e968d76994f 100644
--- a/llvm/test/CodeGen/X86/avx512bw-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-mov.ll
@@ -24,8 +24,7 @@ define void @test2(i8 * %addr, <64 x i8> %data) {
define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmb %zmm1, %zmm1, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
@@ -38,8 +37,7 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
; CHECK-LABEL: test4:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
@@ -72,8 +70,7 @@ define void @test6(i8 * %addr, <32 x i16> %data) {
define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
; CHECK-LABEL: test7:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vptestmw %zmm1, %zmm1, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
@@ -86,8 +83,7 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
; CHECK-LABEL: test8:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll
index 1826890d49c..508595e4366 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -24,8 +24,7 @@ define void @test_256_2(i8 * %addr, <32 x i8> %data) {
define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
; CHECK-LABEL: test_256_3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vptestmb %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x26,0xc9]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
@@ -38,8 +37,7 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
; CHECK-LABEL: test_256_4:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vptestmb %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc8]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
@@ -72,8 +70,7 @@ define void @test_256_6(i8 * %addr, <16 x i16> %data) {
define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
; CHECK-LABEL: test_256_7:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vptestmw %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x26,0xc9]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
@@ -86,8 +83,7 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
; CHECK-LABEL: test_256_8:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vptestmw %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc8]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
@@ -120,8 +116,7 @@ define void @test_128_2(i8 * %addr, <16 x i8> %data) {
define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
; CHECK-LABEL: test_128_3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vptestmb %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x26,0xc9]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
@@ -134,8 +129,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
; CHECK-LABEL: test_128_4:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vptestmb %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc8]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
@@ -168,8 +162,7 @@ define void @test_128_6(i8 * %addr, <8 x i16> %data) {
define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
; CHECK-LABEL: test_128_7:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vptestmw %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x26,0xc9]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
@@ -182,8 +175,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
; CHECK-LABEL: test_128_8:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vptestmw %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc8]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vl-arith.ll b/llvm/test/CodeGen/X86/avx512vl-arith.ll
index beaefe92aac..967ac3b7948 100755
--- a/llvm/test/CodeGen/X86/avx512vl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-arith.ll
@@ -76,8 +76,7 @@ define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind {
define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -89,8 +88,7 @@ define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mas
define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -102,8 +100,7 @@ define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %ma
define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_fold_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -116,8 +113,7 @@ define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x
define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_broadcast_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI10_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -130,8 +126,7 @@ define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1)
define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_fold_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -144,8 +139,7 @@ define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8
define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_broadcast_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI12_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -216,8 +210,7 @@ define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind {
define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -229,8 +222,7 @@ define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vmulps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -242,8 +234,7 @@ define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1)nounwind readnone {
; CHECK-LABEL: test_mask_vminps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vminps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -256,8 +247,7 @@ define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vmaxps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -270,8 +260,7 @@ define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -283,8 +272,7 @@ define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivps_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
; CHECK-NEXT: vdivps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -296,8 +284,7 @@ define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vmulpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -309,8 +296,7 @@ define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vminpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -323,8 +309,7 @@ define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vmaxpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -337,8 +322,7 @@ define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vsubpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -350,8 +334,7 @@ define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vdivpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -363,8 +346,7 @@ define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -376,8 +358,7 @@ define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_maskz_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -389,8 +370,7 @@ define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4
define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_fold_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
; CHECK-NEXT: vaddpd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -403,8 +383,7 @@ define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %
define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_fold_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vaddpd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -429,8 +408,7 @@ define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nou
define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_broadcast_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpcmpneqq %ymm0, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xc8,0x04]
+; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm1, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x0f]
; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -446,8 +424,7 @@ define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x doub
define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_broadcast_vaddpd_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -524,8 +501,7 @@ define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind {
define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x08,0x27,0xca]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -537,8 +513,7 @@ define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mas
define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x08,0x27,0xca]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -550,8 +525,7 @@ define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %ma
define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_fold_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -564,8 +538,7 @@ define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x
define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_broadcast_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI46_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -578,8 +551,7 @@ define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1)
define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_fold_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -592,8 +564,7 @@ define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4
define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_broadcast_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI48_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -664,8 +635,7 @@ define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind {
define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -677,8 +647,7 @@ define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vmulps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -690,8 +659,7 @@ define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vminps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -704,8 +672,7 @@ define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vmaxps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -718,8 +685,7 @@ define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vsubps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -732,8 +698,7 @@ define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivps_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
; CHECK-NEXT: vdivps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -745,8 +710,7 @@ define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vmulpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -758,8 +722,7 @@ define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vminpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -772,8 +735,7 @@ define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vmaxpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -786,8 +748,7 @@ define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vsubpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -799,8 +760,7 @@ define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vdivpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -812,8 +772,7 @@ define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -825,8 +784,7 @@ define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
; CHECK-LABEL: test_maskz_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
<2 x i64> %mask1) nounwind readnone {
@@ -839,8 +797,7 @@ define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_fold_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
; CHECK-NEXT: vaddpd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -853,8 +810,7 @@ define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %
define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_fold_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vaddpd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -879,8 +835,7 @@ define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nou
define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_broadcast_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpcmpneqq %xmm0, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xc8,0x04]
+; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm1, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x0f]
; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -896,8 +851,7 @@ define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x doub
define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_broadcast_vaddpd_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vl-mov.ll b/llvm/test/CodeGen/X86/avx512vl-mov.ll
index f0ce312305f..90d9ff3250d 100644
--- a/llvm/test/CodeGen/X86/avx512vl-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-mov.ll
@@ -164,8 +164,7 @@ define <8 x float> @test_256_16(i8 * %addr) {
define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_17:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -178,8 +177,7 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_18:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -192,8 +190,7 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_19:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8]
; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -206,8 +203,7 @@ define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_20:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8]
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -220,8 +216,7 @@ define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_21:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -234,8 +229,7 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_22:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -248,8 +242,7 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_23:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -262,8 +255,7 @@ define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_24:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -332,8 +324,7 @@ define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_29:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -346,8 +337,7 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_30:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -360,8 +350,7 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_31:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -374,8 +363,7 @@ define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -548,8 +536,7 @@ define <4 x float> @test_128_16(i8 * %addr) {
define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_17:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -562,8 +549,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_18:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -576,8 +562,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_19:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -590,8 +575,7 @@ define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_20:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -604,8 +588,7 @@ define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_21:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -618,8 +601,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_22:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -632,8 +614,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_23:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -646,8 +627,7 @@ define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_24:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -660,8 +640,7 @@ define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_25:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -674,8 +653,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_26:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -688,8 +666,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_27:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -702,8 +679,7 @@ define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_28:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -716,8 +692,7 @@ define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_29:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -730,8 +705,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_30:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -744,8 +718,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_31:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -758,8 +731,7 @@ define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll
index 7d24b8161e5..97fa973127b 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -70,8 +70,7 @@ define <8 x float> @_inreg8xfloat(float %a) {
define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) {
; CHECK-LABEL: _ss8xfloat_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -84,8 +83,7 @@ define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1
define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
; CHECK-LABEL: _ss8xfloat_maskz:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -108,8 +106,7 @@ define <4 x float> @_inreg4xfloat(float %a) {
define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xfloat_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -122,8 +119,7 @@ define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1
define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xfloat_maskz:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -146,8 +142,7 @@ define <4 x double> @_inreg4xdouble(double %a) {
define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xdouble_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -160,8 +155,7 @@ define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %m
define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xdouble_maskz:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -185,8 +179,7 @@ define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %
define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/compress_expand.ll b/llvm/test/CodeGen/X86/compress_expand.ll
index fb550be6310..57767e23e3d 100644
--- a/llvm/test/CodeGen/X86/compress_expand.ll
+++ b/llvm/test/CodeGen/X86/compress_expand.ll
@@ -256,7 +256,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1
; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
;
@@ -265,7 +265,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
@@ -281,7 +281,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
@@ -290,7 +290,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k0
; KNL-NEXT: kshiftrw $14, %k0, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
@@ -303,9 +303,8 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
; ALL-LABEL: test15:
; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; ALL-NEXT: vptestnmd %zmm3, %zmm3, %k1
+; ALL-NEXT: vptestnmd %zmm2, %zmm2, %k2
; ALL-NEXT: kmovw %k2, %eax
; ALL-NEXT: popcntl %eax, %eax
; ALL-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k1}
@@ -320,9 +319,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
; SKX-LABEL: test16:
; SKX: # %bb.0:
; SKX-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k2
; SKX-NEXT: kmovb %k2, %eax
; SKX-NEXT: popcntl %eax, %eax
; SKX-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
@@ -331,10 +329,9 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
;
; KNL-LABEL: test16:
; KNL: # %bb.0:
-; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm4, %k1
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k2
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; KNL-NEXT: vptestnmd %zmm3, %zmm3, %k1
+; KNL-NEXT: vptestnmd %zmm2, %zmm2, %k2
; KNL-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: movzbl %al, %eax
@@ -349,9 +346,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
; SKX-LABEL: test17:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k2
; SKX-NEXT: kmovw %k2, %eax
; SKX-NEXT: popcntl %eax, %eax
; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
@@ -361,9 +357,8 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
;
; KNL-LABEL: test17:
; KNL: # %bb.0:
-; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; KNL-NEXT: vptestnmd %zmm3, %zmm3, %k1
+; KNL-NEXT: vptestnmd %zmm2, %zmm2, %k2
; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: popcntl %eax, %eax
; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 941fdc6c15b..574f271e4a4 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2769,10 +2769,9 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; KNL_64-LABEL: test_gather_setcc_split:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm6, %k1
-; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
; KNL_64-NEXT: vmovapd %zmm2, %zmm0
@@ -2791,10 +2790,9 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm6, %k1
-; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; KNL_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
@@ -2807,9 +2805,8 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; SKX: # %bb.0:
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; SKX-NEXT: vpcmpeqd %ymm6, %ymm5, %k1
-; SKX-NEXT: vpcmpeqd %ymm6, %ymm1, %k2
+; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; SKX-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
; SKX-NEXT: vmovapd %zmm2, %zmm0
@@ -2829,9 +2826,8 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
; SKX_32-NEXT: movl 8(%ebp), %eax
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX_32-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; SKX_32-NEXT: vpcmpeqd %ymm6, %ymm5, %k1
-; SKX_32-NEXT: vpcmpeqd %ymm6, %ymm1, %k2
+; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; SKX_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
@@ -2851,10 +2847,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; KNL_64-LABEL: test_scatter_setcc_split:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm1, %k1
+; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
; KNL_64-NEXT: vzeroupper
@@ -2872,10 +2867,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k1
+; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
; KNL_32-NEXT: movl %ebp, %esp
@@ -2886,10 +2880,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; SKX-LABEL: test_scatter_setcc_split:
; SKX: # %bb.0:
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; SKX-NEXT: vpcmpeqd %ymm5, %ymm1, %k1
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT: vpcmpeqd %ymm5, %ymm1, %k2
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
; SKX-NEXT: vzeroupper
@@ -2907,10 +2900,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32>
; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX_32-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; SKX_32-NEXT: vpcmpeqd %ymm5, %ymm1, %k1
+; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k1
; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; SKX_32-NEXT: vpcmpeqd %ymm5, %ymm1, %k2
+; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index 80dabcdd123..cd28147878c 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -101,8 +101,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -112,8 +111,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
;
; SKX-LABEL: test6:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
@@ -134,8 +132,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -145,8 +142,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
;
; SKX-LABEL: test7:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -175,8 +171,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
@@ -186,8 +181,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
;
; SKX-LABEL: test8:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -214,8 +208,7 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
@@ -224,8 +217,7 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
;
; SKX-LABEL: test9:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -259,8 +251,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -269,8 +260,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
;
; SKX-LABEL: test10:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -301,8 +291,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
; AVX512F-LABEL: test10b:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
@@ -311,8 +300,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
;
; SKX-LABEL: test10b:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -344,8 +332,7 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float>
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -354,8 +341,7 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float>
;
; SKX-LABEL: test11a:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -510,8 +496,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
@@ -520,8 +505,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
;
; SKX-LABEL: test12:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -554,7 +538,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
@@ -565,7 +549,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -598,10 +582,10 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -610,7 +594,7 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -644,7 +628,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -656,7 +640,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -693,10 +677,10 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -706,7 +690,7 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -739,7 +723,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
@@ -751,7 +735,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 308395d365c..37ff7115ac9 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1900,8 +1900,7 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m
;
; AVX512-LABEL: test_masked_v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr35272.ll b/llvm/test/CodeGen/X86/pr35272.ll
index 0df1d7cb83c..0b832d56310 100644
--- a/llvm/test/CodeGen/X86/pr35272.ll
+++ b/llvm/test/CodeGen/X86/pr35272.ll
@@ -4,8 +4,7 @@
define <2 x i48> @PR35272(<2 x i64> %a0, <2 x i48> %a1, <2 x i48> %a2) {
; CHECK-LABEL: PR35272:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vpblendmq %xmm1, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%1 = icmp eq <2 x i64> %a0, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index 00d3a5c67dc..2745055da99 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -7,8 +7,8 @@
define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_sext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
;
; AVX512VL-LABEL: testv8i1_sext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -58,9 +59,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -70,10 +72,9 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512F-LABEL: testv16i1_sext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -91,9 +92,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -104,9 +106,10 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_sext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
@@ -115,10 +118,9 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512F-LABEL: testv16i1_sext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -135,8 +137,8 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_zext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vzeroupper
@@ -144,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
;
; AVX512VL-LABEL: testv8i1_zext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -168,9 +170,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: movl {{.*}}(%rip), %eax
; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -185,9 +188,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_zext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -197,10 +201,9 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512F-LABEL: testv16i1_zext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -218,9 +221,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: movl {{.*}}(%rip), %eax
; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -231,9 +235,10 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
;
; AVX512VL-LABEL: testv16i1_zext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
@@ -242,10 +247,9 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX512F-LABEL: testv16i1_zext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 39713666fcd..77d66bd9b08 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -11,9 +11,10 @@
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VL: # %bb.0:
-; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
@@ -44,9 +45,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -59,9 +61,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VLBW: # %bb.0:
-; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX256VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1
; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0
; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1
; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -73,9 +76,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
;
; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -89,9 +93,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm1, %k2
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -106,9 +109,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm1, %k2
+; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -200,8 +202,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
;
; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VLBW: # %bb.0:
-; AVX256VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX256VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; AVX256VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
@@ -214,8 +215,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
;
; AVX512VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0
; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VLBW-NEXT: vpermw %zmm0, %zmm1, %zmm0
@@ -226,8 +226,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512BW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index 2f62d92664a..10db0aeb25e 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -370,8 +370,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
@@ -457,8 +456,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 743fff3feee..6597925bab9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4591,8 +4591,7 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
; AVX512VL-LABEL: PR34369:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12]
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; AVX512VL-NEXT: vptestnmw %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 7da1f2c9e08..66bd70ec81d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -307,8 +307,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; VL_BW_DQ-NEXT: vpcmpeqw %zmm3, %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
@@ -364,8 +363,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; VL_BW_DQ-NEXT: vpcmpeqb %ymm3, %ymm0, %k0
+; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
@@ -381,9 +379,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm1, %k2
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -397,9 +394,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm0, %k1
-; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm1, %k2
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -413,9 +409,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -434,9 +429,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm1, %k2
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -450,9 +444,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm0, %k1
-; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm1, %k2
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -466,9 +459,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
OpenPOWER on IntegriCloud