diff options
author | Craig Topper <craig.topper@intel.com> | 2018-01-27 20:19:09 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-01-27 20:19:09 +0000 |
commit | 247016a735bd3264e07c6198dd3a7c419e6eeaee (patch) | |
tree | e6699e05b9802670c7aa93f9f201ed534ae4dc02 | |
parent | 513d3fa674c621d864383ffa6418878118d48791 (diff) | |
download | bcm5719-llvm-247016a735bd3264e07c6198dd3a7c419e6eeaee.tar.gz bcm5719-llvm-247016a735bd3264e07c6198dd3a7c419e6eeaee.zip |
[X86] Use vptestm/vptestnm for comparisons with zero to avoid creating a zero vector.
Because vptestm/vptestnm AND their two operands and test each element of the result against zero, we can use the same input for both operands to get a free compare with zero.
We already use this trick in a couple of places where we explicitly create PTESTM with the same input twice. This generalizes it.
I'm hoping to remove the ISD opcodes and move this to isel patterns like we do for scalar cmp/test.
llvm-svn: 323605
30 files changed, 2560 insertions, 4993 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 912ae31aaf3..776632551d5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17777,6 +17777,13 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM, dl, VT, RHS, LHS); } + + // If this is just a comparison with 0 without an AND, we can just use + // the same input twice to avoid creating a zero vector. + if (ISD::isBuildVectorAllZeros(Op1.getNode())) { + return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM, + dl, VT, Op0, Op0); + } } unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 766238f3280..7c8a18ad782 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -401,8 +401,7 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -414,8 +413,7 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_maskz_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -427,8 +425,7 @@ define <16 x i32> 
@vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_mask_fold_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -441,8 +438,7 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_mask_broadcast_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -454,8 +450,7 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_maskz_fold_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -468,8 +463,7 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd_maskz_broadcast_test: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = 
icmp ne <16 x i32> %mask1, zeroinitializer @@ -671,8 +665,7 @@ entry: define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vaddps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -686,8 +679,7 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vmulps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -701,8 +693,7 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vminps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -718,38 +709,33 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ; AVX512F-LABEL: test_mask_vminpd: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mask_vminpd: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; AVX512VL-NEXT: vptestmd %ymm3, %ymm3, %k1 ; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; 
AVX512BW-LABEL: test_mask_vminpd: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: test_mask_vminpd: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: test_mask_vminpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 ; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq <8 x double> %j, <8 x i32> %mask1) @@ -764,8 +750,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vmaxps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -781,38 +766,33 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ; AVX512F-LABEL: test_mask_vmaxpd: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mask_vmaxpd: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; AVX512VL-NEXT: vptestmd %ymm3, %ymm3, %k1 ; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, 
%zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: test_mask_vmaxpd: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: test_mask_vmaxpd: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: test_mask_vmaxpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 ; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq <8 x double> %j, <8 x i32> %mask1) @@ -827,8 +807,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vsubps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -842,8 +821,7 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, ; CHECK-LABEL: test_mask_vdivps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) @@ -857,8 +835,7 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x 
double> %i, ; CHECK-LABEL: test_mask_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestmq %zmm3, %zmm3, %k1 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <8 x double> %j, <8 x i64> %mask1) @@ -872,8 +849,7 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, ; CHECK-LABEL: test_maskz_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq <8 x i64> %mask1) nounwind readnone { @@ -886,8 +862,7 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, ; CHECK-LABEL: test_mask_fold_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq <8 x double>* %j, <8 x i64> %mask1) @@ -902,8 +877,7 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, ; CHECK-LABEL: test_maskz_fold_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq <8 x i64> %mask1) nounwind { @@ -930,8 +904,7 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, ; CHECK-LABEL: test_mask_broadcast_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 +; 
CHECK-NEXT: vptestmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -949,8 +922,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, ; CHECK-LABEL: test_maskz_broadcast_vaddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq <8 x i64> %mask1) nounwind { diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index c5efcb4f358..f4425b7583d 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2413,8 +2413,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; KNL-LABEL: test_bitcast_v8i1_zext: ; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movzbl %al, %eax ; KNL-NEXT: addl %eax, %eax @@ -2423,8 +2422,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; ; SKX-LABEL: test_bitcast_v8i1_zext: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: addl %eax, %eax ; SKX-NEXT: vzeroupper @@ -2432,8 +2430,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; ; AVX512BW-LABEL: test_bitcast_v8i1_zext: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: addl %eax, %eax @@ -2442,8 +2439,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x 
i32> %a) { ; ; AVX512DQ-LABEL: test_bitcast_v8i1_zext: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovb %k0, %eax ; AVX512DQ-NEXT: addl %eax, %eax ; AVX512DQ-NEXT: vzeroupper @@ -2459,8 +2455,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; CHECK-LABEL: test_bitcast_v16i1_zext: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addl %eax, %eax ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll index 083427c76da..275884c6de0 100644 --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -5,8 +5,7 @@ define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) { ; AVX512-LABEL: test1: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer @@ -17,8 +16,7 @@ define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) { define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) { ; AVX512-LABEL: test2: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer @@ -29,8 +27,7 @@ define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) { define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) { ; AVX512-LABEL: test3: ; AVX512: ## 
%bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -42,8 +39,7 @@ define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) { define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) { ; AVX512-LABEL: test4: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer @@ -54,8 +50,7 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) { ; AVX512-LABEL: test13: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -99,9 +94,8 @@ declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) { ; AVX512-LABEL: test23: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512-NEXT: vpcmpeqq %zmm2, %zmm1, %k2 +; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vptestnmq %zmm1, %zmm1, %k2 ; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z} ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll index f1a2ac880ed..f7ab9e24d72 100644 --- a/llvm/test/CodeGen/X86/avx512-mov.ll +++ b/llvm/test/CodeGen/X86/avx512-mov.ll @@ -311,8 +311,7 @@ define <16 x float> @test31(i8 * %addr) { define <16 x i32> 
@test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK-LABEL: test32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9] ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -325,8 +324,7 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK-LABEL: test33: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9] ; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -339,8 +337,7 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { ; CHECK-LABEL: test34: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8] ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -353,8 +350,7 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { ; CHECK-LABEL: test35: ; CHECK: ## %bb.0: 
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8] ; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -367,8 +363,7 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK-LABEL: test36: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9] ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -381,8 +376,7 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK-LABEL: test37: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9] ; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -395,8 +389,7 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { ; CHECK-LABEL: test38: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: 
vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8] ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -409,8 +402,7 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) { ; CHECK-LABEL: test39: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8] ; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll index a7969251940..1f9fa5204d1 100755 --- a/llvm/test/CodeGen/X86/avx512-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-schedule.ll @@ -400,15 +400,13 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} 
# sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -420,15 +418,13 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -440,15 +436,13 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_fold_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_fold_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ 
-461,15 +455,13 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_broadcast_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_broadcast_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -481,15 +473,13 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_fold_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_fold_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -502,15 +492,13 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> 
%i, <16 x i32>* %j.ptr, <16 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_broadcast_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_broadcast_test: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -717,15 +705,13 @@ entry: define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ; GENERIC-LABEL: test_mask_vaddps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vaddps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] <16 x float> %j, <16 x i32> %mask1) @@ -739,15 +725,13 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmulps: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmulps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -759,15 +743,13 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vminps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vminps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -780,15 +762,13 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vminpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, 
%k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vminpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -801,15 +781,13 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x d define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmaxps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmaxps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -822,15 +800,13 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmaxpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; 
GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmaxpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -843,15 +819,13 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x d define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vsubps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vsubps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -863,15 +837,13 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vdivps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_mask_vdivps: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -883,15 +855,13 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -903,15 +873,13 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x d define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { ; GENERIC-LABEL: test_maskz_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: 
[1:0.33] -; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -923,15 +891,13 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i6 define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_mask_fold_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_fold_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -944,15 +910,13 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, < define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_maskz_fold_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_fold_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq 
%zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -983,16 +947,14 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_mask_broadcast_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_broadcast_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1009,15 +971,13 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, ; GENERIC-LABEL: test_maskz_broadcast_vaddpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_broadcast_vaddpd: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # 
sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] <8 x i64> %mask1) nounwind { @@ -6383,15 +6343,13 @@ define <16 x float> @mov_test31(i8 * %addr) { define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test32: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -6404,15 +6362,13 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test33: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test33: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -6425,15 +6381,13 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { 
define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test34: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test34: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -6446,15 +6400,13 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test35: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test35: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -6467,15 +6419,13 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test36: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq 
%zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test36: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -6488,15 +6438,13 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test37: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test37: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -6509,15 +6457,13 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test38: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test38: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -6530,15 +6476,13 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test39: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test39: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -8032,8 +7976,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; GENERIC-LABEL: test_bitcast_v8i1_zext: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] @@ -8041,8 +7984,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; ; SKX-LABEL: test_bitcast_v8i1_zext: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00] ; SKX-NEXT: addl 
%eax, %eax # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] @@ -8058,8 +8000,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; GENERIC-LABEL: test_bitcast_v16i1_zext: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] @@ -8067,8 +8008,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; ; SKX-LABEL: test_bitcast_v16i1_zext: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00] ; SKX-NEXT: addl %eax, %eax # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] @@ -8292,16 +8232,14 @@ define <16 x float> @_inreg16xfloat(float %a) { define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_mask: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8315,15 +8253,13 @@ define <16 x 
float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_maskz: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_maskz: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -8352,15 +8288,13 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) { define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_mask_load: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask_load: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr @@ -8374,15 +8308,13 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_maskz_load: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_maskz_load: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr @@ -8411,16 +8343,14 @@ define <8 x double> @_inreg8xdouble(double %a) { define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_mask: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8434,15 +8364,13 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_maskz: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: 
[1:1.00] ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_maskz: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -8471,15 +8399,13 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) { define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_mask_load: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask_load: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr @@ -8493,15 +8419,13 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_maskz_load: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_maskz_load: ; SKX: # %bb.0: -; SKX-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr diff --git a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll index 6e4bd992718..544d7e2f1a8 100755 --- a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -23,8 +23,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-LABEL: test_masked_16xi16_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -32,8 +31,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve ; SKX-LABEL: test_masked_16xi16_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -47,16 +45,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> % ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: 
[7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> @@ -68,8 +64,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-LABEL: test_masked_16xi16_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -77,8 +72,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve ; SKX-LABEL: test_masked_16xi16_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm3, 
%ymm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -92,16 +86,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> % ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> @@ -113,8 +105,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-LABEL: test_masked_16xi16_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -122,8 +113,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, 
<16 x i16> %ve ; SKX-LABEL: test_masked_16xi16_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -137,16 +127,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> % ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> @@ -173,8 +161,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-LABEL: test_masked_16xi16_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -182,8 +169,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve ; SKX-LABEL: test_masked_16xi16_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -197,16 +183,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> % ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, 
i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> @@ -234,16 +218,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -257,16 +239,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: 
[3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -280,16 +260,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -303,16 +281,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; 
SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -326,16 +302,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -349,16 +323,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = 
[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -388,16 +360,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -411,16 +381,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_z_16xi16_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -449,8 +417,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-LABEL: test_masked_32xi16_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -458,8 +425,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; SKX-LABEL: test_masked_32xi16_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -473,16 +439,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10> @@ -494,8 +458,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-LABEL: test_masked_32xi16_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -503,8 +466,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; SKX-LABEL: 
test_masked_32xi16_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -518,16 +480,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16> @@ -539,8 +499,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 
x i16> %vec, <32 x i16> %ve ; GENERIC-LABEL: test_masked_32xi16_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -548,8 +507,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve ; SKX-LABEL: test_masked_32xi16_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -563,16 +521,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27> @@ -599,8 +555,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-LABEL: test_masked_32xi16_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -608,8 +563,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; SKX-LABEL: test_masked_32xi16_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -623,16 
+577,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4> @@ -660,16 +612,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -683,16 +633,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -706,16 +654,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: 
[4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -729,16 +675,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -752,16 +696,14 @@ 
define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -775,16 +717,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: 
[1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -814,16 +754,14 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -837,16 +775,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -875,8 +811,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test_masked_8xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -884,8 +819,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; SKX-LABEL: test_masked_8xi32_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -899,16 +833,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # 
sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> @@ -920,8 +852,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test_masked_8xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -929,8 +860,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; SKX-LABEL: test_masked_8xi32_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -944,16 +874,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x 
i32> %vec, <8 x i32> %mask ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3> @@ -965,8 +893,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test_masked_8xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -974,8 +901,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; SKX-LABEL: test_masked_8xi32_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: 
vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -989,16 +915,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4> @@ -1025,8 +949,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test_masked_8xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1034,8 +957,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; SKX-LABEL: test_masked_8xi32_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = 
[3,0,3,1,0,4,5,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1049,16 +971,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0> @@ -1086,16 +1006,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xi32_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1109,16 +1027,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1132,16 +1048,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xi32_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1155,16 +1069,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1178,16 +1090,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xi32_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1201,16 +1111,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> % ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1240,16 +1148,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xi32_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1263,16 +1169,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -1301,8 +1205,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-LABEL: test_masked_16xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # 
sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1310,8 +1213,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; SKX-LABEL: test_masked_16xi32_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1325,16 +1227,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7> @@ -1346,8 +1246,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-LABEL: test_masked_16xi32_perm_mask1: ; GENERIC: # 
%bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1355,8 +1254,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; SKX-LABEL: test_masked_16xi32_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1370,16 +1268,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: 
[3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3> @@ -1391,8 +1287,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-LABEL: test_masked_16xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1400,8 +1295,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; SKX-LABEL: test_masked_16xi32_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1415,16 +1309,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; 
; SKX-LABEL: test_masked_z_16xi32_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5> @@ -1451,8 +1343,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-LABEL: test_masked_16xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1460,8 +1351,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; SKX-LABEL: test_masked_16xi32_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1475,16 +1365,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12> @@ -1512,16 +1400,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # 
sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1535,16 +1421,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1558,16 +1442,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: 
[3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1581,16 +1463,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1604,16 +1484,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1627,16 +1505,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1666,16 +1542,14 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1689,16 +1563,14 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -1724,16 +1596,14 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) { define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1746,15 +1616,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> @@ -1765,16 +1633,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 
# sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1787,15 +1653,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> @@ -1806,16 +1670,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: 
vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1828,15 +1690,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1> @@ -1860,16 +1720,14 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) { define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: 
vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1882,15 +1740,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3> @@ -1915,15 +1771,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -1936,15 +1790,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -1957,15 +1809,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> % define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_4xi64_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -1978,15 +1828,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -1999,15 +1847,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> % define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -2020,15 +1866,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -2055,15 +1899,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; 
SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -2076,15 +1918,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp @@ -2113,8 +1953,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-LABEL: test_masked_8xi64_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2122,8 +1961,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-LABEL: test_masked_8xi64_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50] -; SKX-NEXT: 
vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2137,16 +1975,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6> @@ -2157,16 +1993,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2179,15 +2013,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5> @@ -2199,8 +2031,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-LABEL: test_masked_8xi64_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] 
; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2208,8 +2039,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-LABEL: test_masked_8xi64_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2223,16 +2053,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1> @@ -2256,16 +2084,14 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2278,15 +2104,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5> @@ -2298,8 +2122,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-LABEL: test_masked_8xi64_perm_mask4: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2307,8 +2130,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-LABEL: test_masked_8xi64_perm_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2322,16 +2144,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x 
i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3> @@ -2342,16 +2162,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2364,15 +2182,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] 
sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> @@ -2399,8 +2215,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-LABEL: test_masked_8xi64_perm_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2408,8 +2223,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-LABEL: test_masked_8xi64_perm_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2423,16 +2237,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: 
[8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7> @@ -2443,16 +2255,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2465,15 +2275,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7> @@ -2501,16 +2309,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2524,16 +2330,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 
{%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2546,15 +2350,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2567,15 +2369,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq 
{{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2589,16 +2389,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2612,16 +2410,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: 
[5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2648,15 +2444,13 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2669,15 +2463,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = 
mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2691,16 +2483,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2714,16 +2504,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: 
retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2736,15 +2524,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2757,15 +2543,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] 
sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2795,16 +2579,14 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2818,16 +2600,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2840,15 +2620,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2861,15 +2639,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] 
; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp @@ -2898,8 +2674,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> ; GENERIC-LABEL: test_masked_8xfloat_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2907,8 +2682,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> ; SKX-LABEL: test_masked_8xfloat_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2922,16 +2696,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, 
%ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> @@ -2943,8 +2715,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> ; GENERIC-LABEL: test_masked_8xfloat_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2952,8 +2723,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> ; SKX-LABEL: test_masked_8xfloat_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -2967,16 +2737,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1: ; GENERIC: # 
%bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1> @@ -2988,8 +2756,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> ; GENERIC-LABEL: test_masked_8xfloat_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2997,8 +2764,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> ; SKX-LABEL: test_masked_8xfloat_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: 
vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3012,16 +2778,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5> @@ -3048,8 +2812,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> ; GENERIC-LABEL: test_masked_8xfloat_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3057,8 +2820,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> ; SKX-LABEL: test_masked_8xfloat_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; SKX-NEXT: 
vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3072,16 +2834,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> @@ -3109,16 +2869,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0: ; SKX: 
# %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3132,16 +2890,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3155,16 +2911,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xfloat_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3178,16 +2932,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3201,16 +2953,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_masked_8xfloat_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3224,16 +2974,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3263,16 +3011,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3286,16 +3032,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp @@ -3324,8 +3068,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl ; GENERIC-LABEL: test_masked_16xfloat_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; 
GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3333,8 +3076,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl ; SKX-LABEL: test_masked_16xfloat_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3348,16 +3090,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> @@ -3369,8 +3109,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl ; GENERIC-LABEL: 
test_masked_16xfloat_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3378,8 +3117,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl ; SKX-LABEL: test_masked_16xfloat_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3393,16 +3131,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps 
%zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1> @@ -3414,8 +3150,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl ; GENERIC-LABEL: test_masked_16xfloat_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3423,8 +3158,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl ; SKX-LABEL: test_masked_16xfloat_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3438,16 +3172,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] 
; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11> @@ -3474,8 +3206,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl ; GENERIC-LABEL: test_masked_16xfloat_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3483,8 +3214,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl ; SKX-LABEL: test_masked_16xfloat_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3498,16 +3228,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x ; GENERIC-LABEL: 
test_masked_z_16xfloat_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3> @@ -3535,16 +3263,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps 
(%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3558,16 +3284,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3581,16 +3305,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, 
%k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3604,16 +3326,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3627,16 +3347,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] 
sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3650,16 +3368,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3689,16 +3405,14 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_16xfloat_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3712,16 +3426,14 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <1 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp @@ -3747,16 +3459,14 @@ define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) { define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xdouble_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; 
GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3769,15 +3479,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2> @@ -3788,16 +3496,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xdouble_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] 
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3810,15 +3516,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0> @@ -3829,16 +3533,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xdouble_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3851,15 +3553,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1> @@ -3883,16 +3583,14 @@ define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) { define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xdouble_perm_mask3: ; GENERIC: # %bb.0: 
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -3905,15 +3603,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2> @@ -3938,15 +3634,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) { define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { ; 
GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -3959,15 +3653,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -3980,15 +3672,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: 
test_masked_4xdouble_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4001,15 +3691,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4022,15 +3710,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: 
test_masked_4xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4043,15 +3729,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4078,15 +3762,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) { define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3: ; 
GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4099,15 +3781,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp @@ -4136,8 +3816,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; GENERIC-LABEL: test_masked_8xdouble_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] 
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4145,8 +3824,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; SKX-LABEL: test_masked_8xdouble_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4160,16 +3838,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4> @@ -4180,16 +3856,14 @@ define <8 x double> 
@test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4202,15 +3876,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector 
<8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6> @@ -4222,8 +3894,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou ; GENERIC-LABEL: test_masked_8xdouble_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4231,8 +3902,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou ; SKX-LABEL: test_masked_8xdouble_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4246,16 +3916,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: 
[1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7> @@ -4279,16 +3947,14 @@ define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) { define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4301,15 +3967,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4> @@ -4321,8 +3985,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou ; GENERIC-LABEL: test_masked_8xdouble_perm_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4330,8 +3993,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou ; SKX-LABEL: test_masked_8xdouble_perm_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4345,16 +4007,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, 
%zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1> @@ -4365,16 +4025,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4387,15 +4045,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x 
i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7> @@ -4422,8 +4078,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou ; GENERIC-LABEL: test_masked_8xdouble_perm_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4431,8 +4086,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou ; SKX-LABEL: test_masked_8xdouble_perm_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # 
sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4446,16 +4100,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2> @@ -4466,16 +4118,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: 
vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4488,15 +4138,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6> @@ -4524,16 +4172,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50] -; SKX-NEXT: vpxor 
%xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4547,16 +4193,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4569,15 +4213,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: ; SKX: # %bb.0: 
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4590,15 +4232,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4612,16 +4252,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = 
[6,7,2,7,7,6,2,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4635,16 +4273,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4671,15 +4307,13 @@ define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_8xdouble_perm_imm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4692,15 +4326,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4714,16 +4346,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4: ; 
SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4737,16 +4367,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4759,15 +4387,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4780,15 +4406,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4818,16 +4442,14 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; 
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4841,16 +4463,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6: ; SKX: # %bb.0: ; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4863,15 +4483,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 
{%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4884,15 +4502,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp @@ -4918,16 +4534,14 @@ define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) { define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_16xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = 
xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4940,15 +4554,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> @@ -4959,16 +4571,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_16xi8_perm_mask1: ; GENERIC: 
# %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -4981,15 +4591,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> @@ -5000,16 +4608,14 
@@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_16xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5022,15 +4628,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] ; 
SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> @@ -5054,16 +4658,14 @@ define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) { define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_16xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5076,15 +4678,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> @@ -5112,16 +4712,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5135,16 +4733,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> % ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5158,16 +4754,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5181,16 +4775,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> % ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb 
{{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5204,16 +4796,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5227,16 +4817,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> % ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5266,16 +4854,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5289,16 +4875,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> % ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp @@ -5324,16 +4908,14 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) { define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_32xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5346,15 +4928,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 
x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21> @@ -5365,16 +4945,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_32xi8_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; 
; SKX-LABEL: test_masked_32xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5387,15 +4965,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24> @@ -5406,16 +4982,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask define <32 x i8> @test_masked_32xi8_perm_mask2(<32 
x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_32xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5428,15 +5002,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = 
ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29> @@ -5460,16 +5032,14 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) { define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_32xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5482,15 +5052,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18> @@ -5518,16 +5086,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = 
ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5541,16 +5107,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> % ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5564,16 +5128,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa 
(%rdi), %ymm2 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5587,16 +5149,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> % ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5610,16 +5170,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: 
vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5633,16 +5191,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> % ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5672,16 +5228,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3: ; GENERIC: # %bb.0: 
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5695,16 +5249,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> % ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: 
[8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp @@ -5730,16 +5282,14 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5752,15 +5302,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = 
zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62> @@ -5771,16 +5319,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = 
zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5793,15 +5339,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = 
zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49> @@ -5812,16 +5356,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = 
zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5834,15 +5376,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, 
i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60> @@ -5866,16 +5406,14 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -5888,15 +5426,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = 
zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61> @@ -5924,16 +5460,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: 
[5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -5947,16 +5481,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> % ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ 
-5970,16 +5502,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -5993,16 +5523,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> % ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -6016,16 +5544,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -6039,16 
+5565,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> % ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -6078,16 +5602,14 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00] 
; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ -6101,16 +5623,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> % ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp @@ 
-6136,16 +5656,14 @@ define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6158,15 +5676,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector 
<8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6> @@ -6177,16 +5693,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6199,15 +5713,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} 
xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -6218,16 +5730,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> % define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6240,15 +5750,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw 
%xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5> @@ -6272,16 +5780,14 @@ define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6294,15 +5800,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3: 
; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7> @@ -6313,16 +5817,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> % define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6335,15 +5837,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = 
xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6> @@ -6354,16 +5854,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6376,15 +5874,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] 
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7> @@ -6408,16 +5904,14 @@ define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6430,15 +5924,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor 
%xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5> @@ -6449,16 +5941,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6471,15 +5961,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, 
<8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -6504,15 +5992,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6525,15 +6011,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16 define <8 x i16> 
@test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6546,15 +6030,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6567,15 +6049,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> define <8 x i16> 
@test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6588,15 +6068,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6609,15 +6087,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 define <8 x i16> 
@test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6644,15 +6120,13 @@ define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6665,15 +6139,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> define <8 x i16> 
@test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6686,15 +6158,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6707,15 +6177,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 define <8 x i16> 
@test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6728,15 +6196,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6749,15 +6215,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> define <8 x i16> 
@test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6784,15 +6248,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6805,15 +6267,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 define <8 x i16> 
@test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6826,15 +6286,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6847,15 +6305,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> define <8 x i16> 
@test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp @@ -6881,16 +6337,14 @@ define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6903,15 +6357,13 @@ define <16 x i16> 
@test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12> @@ -6922,16 +6374,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; 
SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6944,15 +6394,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -6963,16 +6411,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: 
[1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -6985,15 +6431,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13> @@ -7017,16 +6461,14 @@ define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: 
vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7039,15 +6481,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -7058,16 
+6498,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7080,15 +6518,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = 
ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15> @@ -7099,16 +6535,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7121,15 +6555,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_masked_z_16xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> @@ -7153,16 +6585,14 @@ define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7175,15 +6605,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; 
GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13> @@ -7194,16 +6622,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7216,15 +6642,13 @@ define <16 x i16> 
@test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -7249,15 +6673,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] 
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7270,15 +6692,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7291,15 +6711,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw 
%ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7312,15 +6730,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7333,15 +6749,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; SKX: 
# %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7354,15 +6768,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7389,15 +6801,13 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq 
# sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7410,15 +6820,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7431,15 +6839,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} 
ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7452,15 +6858,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7473,15 +6877,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7494,15 +6896,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7529,15 +6929,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # 
sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7550,15 +6948,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7571,15 +6967,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; GENERIC-LABEL: 
test_masked_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7592,15 +6986,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp @@ -7626,16 +7018,14 @@ define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) { define <32 x i16> 
@test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7648,15 +7038,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: 
vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28> @@ -7667,16 +7055,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7689,15 +7075,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -7708,16 +7092,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; 
SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7730,15 +7112,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31> @@ -7762,16 +7142,14 @@ define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, 
%xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7784,15 +7162,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector 
<32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -7803,16 +7179,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7825,15 +7199,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 
{%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30> @@ -7844,16 +7216,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = 
zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7866,15 +7236,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -7898,16 +7266,14 @@ define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7920,15 +7286,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, 
i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30> @@ -7939,16 +7303,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -7961,15 +7323,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] 
sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -7994,15 +7354,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8015,15 +7373,13 @@ define <32 x i16> 
@test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8036,15 +7392,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # 
sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8057,15 +7411,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8078,15 +7430,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] 
sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8099,15 +7449,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8134,15 +7482,13 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8155,15 +7501,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8176,15 +7520,13 @@ define <32 x i16> 
@test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8197,15 +7539,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # 
sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8219,16 +7559,14 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8242,16 +7580,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} zmm1 = 
mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8278,15 +7614,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8299,15 +7633,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufhw 
{{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8320,15 +7652,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8341,15 +7671,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) { ; 
GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp @@ -8375,16 +7703,14 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8397,15 +7723,13 @@ define <4 x i32> 
@test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0> @@ -8416,16 +7740,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8438,15 
+7760,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0> @@ -8457,16 +7777,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # 
sched: [7:1.00] @@ -8479,15 +7797,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0> @@ -8511,16 +7827,14 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] ; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq 
# sched: [7:1.00] @@ -8533,15 +7847,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> @@ -8566,15 +7878,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) { define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8587,15 +7897,13 @@ define <4 
x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8608,15 +7916,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8629,15 +7935,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve define 
<4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8650,15 +7954,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8671,15 +7973,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) 
{ ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8706,15 +8006,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8727,15 +8025,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp @@ -8761,16 +8057,14 @@ define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) { define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8783,15 +8077,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4> @@ -8802,16 +8094,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mas define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8824,15 +8114,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; GENERIC-LABEL: 
test2_masked_z_8xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7> @@ -8843,16 +8131,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mas define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8865,15 +8151,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, define <8 
x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7> @@ -8897,16 +8181,14 @@ define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) { define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -8919,15 +8201,13 @@ define <8 x i32> 
@test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4> @@ -8952,15 +8232,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) { define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -8973,15 +8251,13 @@ 
define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -8994,15 +8270,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9015,15 +8289,13 @@ define <8 x i32> 
@test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9036,15 +8308,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9057,15 +8327,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x 
i32>* %vp, <8 x i32> %v define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9092,15 +8360,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) { define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9113,15 +8379,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v define <8 x i32> 
@test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp @@ -9147,16 +8411,14 @@ define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) { define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9169,15 +8431,13 @@ define <16 x i32> 
@test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12> @@ -9188,16 +8448,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, 
%zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9210,15 +8468,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12> @@ -9229,16 +8485,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # 
sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9251,15 +8505,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12> @@ -9283,16 +8535,14 @@ define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) { define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd 
%zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9305,15 +8555,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15> @@ -9338,15 +8586,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) { define 
<16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9359,15 +8605,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9380,15 +8624,13 @@ define <16 x i32> 
@test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9401,15 +8643,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x 
i32>, <16 x i32>* %vp @@ -9422,15 +8662,13 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9443,15 +8681,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] 
sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9478,15 +8714,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) { define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9499,15 +8733,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} 
{z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp @@ -9533,16 +8765,14 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9555,15 +8785,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd 
%ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9574,16 +8802,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9596,15 +8822,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = 
ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9615,16 +8839,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9637,15 +8859,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor 
%xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -9669,16 +8889,14 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9691,15 +8909,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo 
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9724,16 +8940,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = 
ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9747,15 +8961,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -9768,16 +8980,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9791,15 +9001,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -9812,16 +9020,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9835,15 +9041,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -9870,16 +9074,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 
{{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9893,15 +9095,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -9927,16 +9127,14 @@ define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; 
GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9949,15 +9147,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 
30, i32 31> @@ -9968,16 +9164,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, < define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -9990,15 +9184,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: 
vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31> @@ -10009,16 +9201,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, < define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10031,15 +9221,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # 
sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> @@ -10063,16 +9251,14 @@ define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10085,15 +9271,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> 
%vec1, <16 x define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> @@ -10118,16 +9302,14 @@ define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x flo define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; 
SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10141,15 +9323,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -10162,16 +9342,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = 
zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10185,15 +9363,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -10206,16 +9382,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2: ; 
GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10229,15 +9403,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -10264,16 +9436,14 @@ define <16 x float> 
@test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x flo define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10287,15 +9457,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 
# sched: [3:1.00] ; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -10321,16 +9489,14 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10343,15 +9509,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: 
vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10362,16 +9526,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10384,15 +9546,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} 
= ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10403,16 +9563,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10425,15 +9583,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; 
GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -10457,16 +9613,14 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10479,15 +9633,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, 
<4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -10512,16 +9664,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10535,15 +9685,13 @@ define 
<4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -10556,16 +9704,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] 
sched: [10:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10579,15 +9725,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -10600,16 +9744,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # 
sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10623,15 +9765,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -10658,16 +9798,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xdouble_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10681,15 +9819,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -10715,16 +9851,14 @@ define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = 
zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10737,15 +9871,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> @@ -10756,16 +9888,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, < define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10778,15 +9908,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13> @@ -10797,16 +9925,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x 
double> %vec1, < define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10819,15 +9945,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: 
[7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9> @@ -10851,16 +9975,14 @@ define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10873,15 +9995,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11> @@ -10906,16 +10026,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x doub define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10929,15 +10047,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 
{{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -10950,16 +10066,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -10973,15 +10087,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # 
sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -10994,16 +10106,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11017,15 +10127,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> 
%vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -11052,16 +10160,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x doub define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11075,15 
+10181,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -11109,16 +10213,14 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = 
ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11131,15 +10233,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -11150,16 +10250,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 
# sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11172,15 +10270,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11191,16 +10287,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: 
vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11213,15 +10307,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -11245,16 +10337,14 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: 
[3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11267,15 +10357,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11300,16 +10388,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> 
%mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11323,15 +10409,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -11344,16 +10428,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, 
<8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11367,15 +10449,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x 
i32>, <8 x i32>* %vec2p @@ -11388,16 +10468,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11411,15 +10489,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 
{{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -11446,16 +10522,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11469,15 +10543,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: 
vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -11503,16 +10575,14 @@ define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) { define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11525,15 +10595,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> @@ -11544,16 +10612,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11566,15 +10632,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: 
test_16xi32_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> @@ -11585,16 +10649,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = 
zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11607,15 +10669,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19> @@ -11639,16 +10699,14 @@ define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) { define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # 
sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11661,15 +10719,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> @@ -11694,16 +10750,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %ve define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0: ; GENERIC: # 
%bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11717,15 +10771,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -11738,16 +10790,14 @@ define <16 x i32> 
@test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11761,15 +10811,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: 
vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -11782,16 +10830,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11805,15 +10851,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_16xi32_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -11840,16 +10884,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %ve define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11863,15 +10905,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -11897,16 +10937,14 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11919,15 +10957,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -11938,16 +10974,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -11960,15 +10994,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> 
%vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -11979,16 +11011,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12001,15 +11031,13 @@ define <4 x i64> 
@test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -12033,16 +11061,14 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: 
vmovdqa %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12055,15 +11081,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12088,16 +11112,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: 
vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12111,15 +11133,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -12132,16 +11152,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 
# sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12155,15 +11173,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -12176,16 +11192,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xi64_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12199,15 +11213,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -12234,16 +11246,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: 
vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12257,15 +11267,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -12291,16 +11299,14 @@ define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) { define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: 
vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12313,15 +11319,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13> @@ -12332,16 +11336,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: 
vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12354,15 +11356,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13> @@ -12373,16 +11373,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> define <8 x i64> 
@test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12395,15 +11393,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> 
<i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9> @@ -12427,16 +11423,14 @@ define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) { define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12449,15 +11443,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: 
vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11> @@ -12482,16 +11474,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12505,15 +11495,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_8xi64_zero_masked_shuff_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -12526,16 +11514,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12549,15 +11535,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: 
vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -12570,16 +11554,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12593,15 +11575,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; 
GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -12628,16 +11608,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12651,15 +11629,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; 
GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -12685,16 +11661,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12707,15 +11681,13 @@ define <4 x float> 
@test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -12726,16 +11698,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; 
SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12748,15 +11718,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -12767,16 +11735,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xfloat_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12789,15 +11755,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -12821,16 +11785,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # 
sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12843,15 +11805,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -12876,16 +11836,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x fl define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: 
test_4xfloat_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12899,15 +11857,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -12920,16 +11876,14 @@ define <4 x float> 
@test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %v define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12943,15 +11897,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} 
xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -12964,16 +11916,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %v define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -12987,15 +11937,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -13022,16 +11970,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x fl define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13045,15 +11991,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 
{%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -13079,16 +12023,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13101,15 +12043,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0: ; GENERIC: # 
%bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> @@ -13120,16 +12060,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; 
SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13142,15 +12080,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> @@ -13161,16 +12097,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13183,15 +12117,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> @@ -13215,16 +12147,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13237,15 +12167,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x 
i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> @@ -13270,16 +12198,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x fl define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13293,15 +12219,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; 
SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -13314,16 +12238,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %v define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13337,15 +12259,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -13358,16 +12278,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %v define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13381,15 +12299,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, define <8 x float> 
@test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -13416,16 +12332,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x fl define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps 
{{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13439,15 +12353,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -13473,16 +12385,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x fl define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 
sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13495,15 +12405,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, < define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, 
i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> @@ -13514,16 +12422,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %ve define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13536,15 +12442,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, < define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> @@ -13555,16 +12459,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %ve define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13577,15 +12479,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, < define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> @@ -13609,16 +12509,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x fl define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, 
%xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13631,15 +12529,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, < define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} 
= zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> @@ -13664,16 +12560,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13687,15 +12581,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0: 
; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -13708,16 +12600,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: 
[3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13731,15 +12621,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -13752,16 +12640,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13775,15 +12661,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = 
zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -13810,16 +12694,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13833,15 +12715,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -13867,16 +12747,14 @@ define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x dou define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] ; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13889,15 +12767,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> 
%vec1, < define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2> @@ -13908,16 +12784,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %ve define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] ; SKX-NEXT: vmovapd 
%xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13930,15 +12804,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, < define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2> @@ -13963,16 +12835,14 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # 
sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -13986,15 +12856,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p @@ -14007,16 +12875,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_2xdouble_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14030,15 +12896,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p @@ -14064,16 +12928,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x dou define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14086,15 +12948,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> @@ -14105,16 +12965,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %ve define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1: ; GENERIC: # 
%bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14127,15 +12985,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> @@ -14146,16 +13002,14 @@ define <4 x double> 
@test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %ve define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14168,15 +13022,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, < define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} 
{z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> @@ -14200,16 +13052,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x dou define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14222,15 +13072,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, < define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xdouble_zero_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> @@ -14255,16 +13103,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14278,15 +13124,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: 
vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -14299,16 +13143,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14322,15 +13164,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, 
<4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -14343,16 +13183,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: 
[7:1.00] @@ -14366,15 +13204,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -14401,16 +13237,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq 
%ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14424,15 +13258,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -14458,16 +13290,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x dou define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14480,15 +13310,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, < define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> @@ -14499,16 +13327,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %ve define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1: ; GENERIC: # 
%bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14521,15 +13347,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, < define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x 
double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> @@ -14540,16 +13364,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %ve define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14562,15 +13384,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, < define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: ; 
SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> @@ -14594,16 +13414,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x dou define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14616,15 +13434,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, < define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor 
%xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> @@ -14649,16 +13465,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: 
vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14672,15 +13486,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -14693,16 +13505,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_8xdouble_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14716,15 +13526,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -14737,16 +13545,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, 
%zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14760,15 +13566,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -14795,16 +13599,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, 
<8 x define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14818,15 +13620,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # 
sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -14852,16 +13652,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14874,15 +13672,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xfloat_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -14893,16 +13689,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1 define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14915,15 +13709,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: 
[3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -14934,16 +13726,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1 define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -14956,15 +13746,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x 
float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -14988,16 +13776,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] ; 
SKX-NEXT: retq # sched: [7:1.00] @@ -15010,15 +13796,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -15043,16 +13827,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x f define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd 
%xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15066,15 +13848,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -15087,16 +13867,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> % define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: 
[1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15110,15 +13888,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -15131,16 +13907,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> % define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; 
GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15154,15 +13928,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -15189,16 +13961,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x f define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { ; 
GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15212,15 +13982,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p @@ -15246,16 +14014,14 @@ define <8 x float> 
@test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15268,15 +14034,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; 
SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> @@ -15287,16 +14051,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1 define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15309,15 +14071,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, 
%k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> @@ -15328,16 +14088,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1 define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15350,15 +14108,13 @@ define <8 x float> 
@test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> @@ -15382,16 +14138,14 @@ define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # 
sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15404,15 +14158,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> @@ -15437,16 +14189,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x f define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] 
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15460,15 +14210,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -15481,16 +14229,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> % define <8 x float> 
@test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15504,15 +14250,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: 
vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -15525,16 +14269,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> % define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15548,15 +14290,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] 
sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -15583,16 +14323,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x f define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15606,15 +14344,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3: ; 
GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p @@ -15640,16 +14376,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x f define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = 
zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15662,15 +14396,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> @@ -15681,16 +14413,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %v define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15703,15 +14433,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; 
SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> @@ -15722,16 +14450,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %v define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15744,15 +14470,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; 
GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> @@ -15776,16 +14500,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x f define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_16xfloat_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15798,15 +14520,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> @@ -15831,16 +14551,14 @@ define <16 x float> 
@test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15854,15 +14572,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -15875,16 +14591,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15898,15 +14612,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> 
%ve define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -15919,16 +14631,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_16xfloat_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -15942,15 +14652,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -15977,16 +14685,14 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 
x float> %vec3, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16000,15 +14706,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; 
SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p @@ -16034,16 +14738,14 @@ define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x do define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16056,15 +14758,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: 
vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3> @@ -16075,16 +14775,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %v define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16097,15 +14795,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3> @@ -16130,16 +14826,14 @@ define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16153,15 +14847,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve define <2 x double> 
@test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p @@ -16174,16 +14866,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: 
retq # sched: [7:1.00] @@ -16197,15 +14887,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p @@ -16231,16 +14919,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x do define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # 
sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16253,15 +14939,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> @@ -16272,16 +14956,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %v define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16294,15 +14976,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> @@ -16313,16 +14993,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %v define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: 
[3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16335,15 +15013,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> @@ -16367,16 +15043,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x do define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x 
double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16389,15 +15063,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x 
double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> @@ -16422,16 +15094,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16445,15 +15115,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %ve define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, 
%ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -16466,16 +15134,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16489,15 +15155,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %ve define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: 
[7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -16510,16 +15174,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16533,15 +15195,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %ve define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; 
GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -16568,16 +15228,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16591,15 +15249,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %ve define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 
x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p @@ -16625,16 +15281,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x do define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; 
SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16647,15 +15301,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> @@ -16666,16 +15318,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %v define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16688,15 +15338,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> @@ -16707,16 +15355,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %v define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: 
test_8xdouble_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16729,15 +15375,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: 
[7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> @@ -16761,16 +15405,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x do define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16783,15 +15425,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> @@ -16816,16 +15456,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16839,15 +15477,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: 
test_8xdouble_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -16860,16 +15496,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; 
SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16883,15 +15517,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -16904,16 +15536,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_8xdouble_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16927,15 +15557,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p @@ -16962,16 +15590,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq 
%zmm3, %zmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -16985,15 +15611,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll 
b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll index b31302d51ff..74005debfed 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll @@ -13,8 +13,7 @@ define <16 x i8> @test_i8_to_16(i8 %s) { define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -27,8 +26,7 @@ define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8 define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -40,8 +38,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -54,8 +51,7 @@ define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8 define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb 
%xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -67,8 +63,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -81,8 +76,7 @@ define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8 define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -94,8 +88,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -108,8 +101,7 @@ define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8 define <16 x i8> @test_masked_z_i8_to_16_mask3(i8 %s, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} ; 
CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -130,8 +122,7 @@ define <32 x i8> @test_i8_to_32(i8 %s) { define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -144,8 +135,7 @@ define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8 define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -157,8 +147,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -171,8 +160,7 @@ define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8 define <32 x i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -184,8 +172,7 @@ define <32 x 
i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -198,8 +185,7 @@ define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8 define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -211,8 +197,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -225,8 +210,7 @@ define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8 define <32 x i8> @test_masked_z_i8_to_32_mask3(i8 %s, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -247,8 +231,7 @@ define <64 x i8> @test_i8_to_64(i8 %s) { define <64 x i8> 
@test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -261,8 +244,7 @@ define <64 x i8> @test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8 define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -274,8 +256,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -288,8 +269,7 @@ define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8 define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -301,8 +281,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8> 
%mask) { ; CHECK-LABEL: test_masked_i8_to_64_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -315,8 +294,7 @@ define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8 define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -328,8 +306,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -342,8 +319,7 @@ define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8 define <64 x i8> @test_masked_z_i8_to_64_mask3(i8 %s, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i8> undef, i8 %s, i32 0 @@ -364,8 +340,7 @@ define <8 x i16> @test_i16_to_8(i16 %s) { define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -378,8 +353,7 @@ define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i1 define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -391,8 +365,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) { define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -405,8 +378,7 @@ define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i1 define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -418,8 +390,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) { define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, 
%k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -432,8 +403,7 @@ define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i1 define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -445,8 +415,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) { define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -459,8 +428,7 @@ define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i1 define <8 x i16> @test_masked_z_i16_to_8_mask3(i16 %s, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -481,8 +449,7 @@ define <16 x i16> @test_i16_to_16(i16 %s) { define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw 
%edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -495,8 +462,7 @@ define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16 define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -508,8 +474,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) { define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -522,8 +487,7 @@ define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16 define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -535,8 +499,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) { define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ; CHECK-NEXT: 
retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -549,8 +512,7 @@ define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16 define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -562,8 +524,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) { define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -576,8 +537,7 @@ define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16 define <16 x i16> @test_masked_z_i16_to_16_mask3(i16 %s, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -598,8 +558,7 @@ define <32 x i16> @test_i16_to_32(i16 %s) { define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ 
-612,8 +571,7 @@ define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32 define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -625,8 +583,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) { define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -639,8 +596,7 @@ define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32 define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -652,8 +608,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) { define <32 x i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -666,8 +621,7 @@ define <32 x 
i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32 define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -679,8 +633,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) { define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -693,8 +646,7 @@ define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32 define <32 x i16> @test_masked_z_i16_to_32_mask3(i16 %s, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i16> undef, i16 %s, i32 0 @@ -715,8 +667,7 @@ define <4 x i32> @test_i32_to_4(i32 %s) { define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -729,8 +680,7 @@ define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i3 
define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -742,8 +692,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -756,8 +705,7 @@ define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i3 define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -769,8 +717,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -783,8 +730,7 @@ define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i3 define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 
%s, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -796,8 +742,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 %s, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -810,8 +755,7 @@ define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i3 define <4 x i32> @test_masked_z_i32_to_4_mask3(i32 %s, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -832,8 +776,7 @@ define <8 x i32> @test_i32_to_8(i32 %s) { define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -846,8 +789,7 @@ define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i3 define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mask0: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -859,8 +801,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -873,8 +814,7 @@ define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i3 define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -886,8 +826,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -900,8 +839,7 @@ define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i3 define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -913,8 +851,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -927,8 +864,7 @@ define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i3 define <8 x i32> @test_masked_z_i32_to_8_mask3(i32 %s, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -949,8 +885,7 @@ define <16 x i32> @test_i32_to_16(i32 %s) { define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -963,8 +898,7 @@ define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16 define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, 
%zmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -976,8 +910,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) { define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -990,8 +923,7 @@ define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16 define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -1003,8 +935,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) { define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -1017,8 +948,7 @@ define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16 define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: 
vpbroadcastd %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -1030,8 +960,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) { define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -1044,8 +973,7 @@ define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16 define <16 x i32> @test_masked_z_i32_to_16_mask3(i32 %s, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -1066,8 +994,7 @@ define <2 x i64> @test_i64_to_2(i64 %s) { define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_2_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1080,8 +1007,7 @@ define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i6 define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_2_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = 
insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1093,8 +1019,7 @@ define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) { define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_2_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1107,8 +1032,7 @@ define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i6 define <2 x i64> @test_masked_z_i64_to_2_mask1(i64 %s, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_2_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1129,8 +1053,7 @@ define <4 x i64> @test_i64_to_4(i64 %s) { define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1143,8 +1066,7 @@ define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i6 define <4 x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1156,8 +1078,7 @@ define <4 
x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1170,8 +1091,7 @@ define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i6 define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1183,8 +1103,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1197,8 +1116,7 @@ define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i6 define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1210,8 +1128,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, 
<4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1224,8 +1141,7 @@ define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i6 define <4 x i64> @test_masked_z_i64_to_4_mask3(i64 %s, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1246,8 +1162,7 @@ define <8 x i64> @test_i64_to_8(i64 %s) { define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1260,8 +1175,7 @@ define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i6 define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1273,8 +1187,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x 
i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1287,8 +1200,7 @@ define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x i64> %default, <8 x i6 define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1300,8 +1212,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1314,8 +1225,7 @@ define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i6 define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1327,8 +1237,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i64> %mask) { ; 
CHECK-LABEL: test_masked_i64_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1341,8 +1250,7 @@ define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i6 define <8 x i64> @test_masked_z_i64_to_8_mask3(i64 %s, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -1364,8 +1272,7 @@ define <16 x i8> @test_i8_to_16_mem(i8* %p) { define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1379,8 +1286,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16 define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1393,8 +1299,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; 
CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1408,8 +1313,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16 define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1422,8 +1326,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1437,8 +1340,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16 define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1451,8 +1353,7 @@ define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) { define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_16_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), 
%xmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1466,8 +1367,7 @@ define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16 define <16 x i8> @test_masked_z_i8_to_16_mem_mask3(i8* %p, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1490,8 +1390,7 @@ define <32 x i8> @test_i8_to_32_mem(i8* %p) { define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1505,8 +1404,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32 define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1519,8 +1417,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1534,8 +1431,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, 
<32 define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1548,8 +1444,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1563,8 +1458,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32 define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1577,8 +1471,7 @@ define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) { define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1592,8 +1485,7 @@ define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32 define <32 x i8> @test_masked_z_i8_to_32_mem_mask3(i8* %p, <32 x i8> %mask) { ; CHECK-LABEL: 
test_masked_z_i8_to_32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1616,8 +1508,7 @@ define <64 x i8> @test_i8_to_64_mem(i8* %p) { define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1631,8 +1522,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64 define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1645,8 +1535,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1660,8 +1549,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64 define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: 
vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1674,8 +1562,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1689,8 +1576,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64 define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1703,8 +1589,7 @@ define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) { define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_i8_to_64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ -1718,8 +1603,7 @@ define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64 define <64 x i8> @test_masked_z_i8_to_64_mem_mask3(i8* %p, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i8, i8* %p @@ 
-1742,8 +1626,7 @@ define <8 x i16> @test_i16_to_8_mem(i16* %p) { define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1757,8 +1640,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8 define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1771,8 +1653,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) { define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1786,8 +1667,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8 define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1800,8 +1680,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) { define <8 x i16> 
@test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1815,8 +1694,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8 define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1829,8 +1707,7 @@ define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) { define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1844,8 +1721,7 @@ define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8 define <8 x i16> @test_masked_z_i16_to_8_mem_mask3(i16* %p, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1868,8 +1744,7 @@ define <16 x i16> @test_i16_to_16_mem(i16* %p) { define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mem_mask0: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1883,8 +1758,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default, define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1897,8 +1771,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask) define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1912,8 +1785,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default, define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1926,8 +1798,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask) define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, 
%ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1941,8 +1812,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default, define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1955,8 +1825,7 @@ define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask) define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_16_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1970,8 +1839,7 @@ define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default, define <16 x i16> @test_masked_z_i16_to_16_mem_mask3(i16* %p, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -1994,8 +1862,7 @@ define <32 x i16> @test_i16_to_32_mem(i16* %p) { define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} ; CHECK-NEXT: 
retq %s = load i16, i16* %p @@ -2009,8 +1876,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default, define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2023,8 +1889,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask) define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2038,8 +1903,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default, define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2052,8 +1916,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask) define <32 x i16> @test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2067,8 +1930,7 @@ define <32 x i16> 
@test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default, define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2081,8 +1943,7 @@ define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask) define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_i16_to_32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2096,8 +1957,7 @@ define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default, define <32 x i16> @test_masked_z_i16_to_32_mem_mask3(i16* %p, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i16, i16* %p @@ -2120,8 +1980,7 @@ define <4 x i32> @test_i32_to_4_mem(i32* %p) { define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2135,8 +1994,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4 define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 
x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2149,8 +2007,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2164,8 +2021,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4 define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2178,8 +2034,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2193,8 +2048,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4 define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2207,8 +2061,7 @@ define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) { define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_4_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2222,8 +2075,7 @@ define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4 define <4 x i32> @test_masked_z_i32_to_4_mem_mask3(i32* %p, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2246,8 +2098,7 @@ define <8 x i32> @test_i32_to_8_mem(i32* %p) { define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2261,8 +2112,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8 define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), 
%ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2275,8 +2125,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2290,8 +2139,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8 define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2304,8 +2152,7 @@ define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2319,8 +2166,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8 define <8 x i32> @test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2333,8 +2179,7 @@ define <8 x i32> 
@test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) { define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2348,8 +2193,7 @@ define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8 define <8 x i32> @test_masked_z_i32_to_8_mem_mask3(i32* %p, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2372,8 +2216,7 @@ define <16 x i32> @test_i32_to_16_mem(i32* %p) { define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2387,8 +2230,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default, define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2401,8 +2243,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask) define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x 
i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2416,8 +2257,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x i32> %default, define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2430,8 +2270,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask) define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2445,8 +2284,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default, define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2459,8 +2297,7 @@ define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask) define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_i32_to_16_mem_mask3: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2474,8 +2311,7 @@ define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default, define <16 x i32> @test_masked_z_i32_to_16_mem_mask3(i32* %p, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i32, i32* %p @@ -2498,8 +2334,7 @@ define <2 x i64> @test_i64_to_2_mem(i64* %p) { define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_2_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2513,8 +2348,7 @@ define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2 define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2527,8 +2361,7 @@ define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) { define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_2_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, 
%xmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2542,8 +2375,7 @@ define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2 define <2 x i64> @test_masked_z_i64_to_2_mem_mask1(i64* %p, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2566,8 +2398,7 @@ define <4 x i64> @test_i64_to_4_mem(i64* %p) { define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2581,8 +2412,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4 define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2595,8 +2425,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2610,8 +2439,7 @@ define <4 x i64> 
@test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4 define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2624,8 +2452,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2639,8 +2466,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4 define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2653,8 +2479,7 @@ define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) { define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_4_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2668,8 +2493,7 @@ define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4 define <4 x i64> 
@test_masked_z_i64_to_4_mem_mask3(i64* %p, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2692,8 +2516,7 @@ define <8 x i64> @test_i64_to_8_mem(i64* %p) { define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2707,8 +2530,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8 define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2721,8 +2543,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2736,8 +2557,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8 define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask1: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2750,8 +2570,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2765,8 +2584,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8 define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2779,8 +2597,7 @@ define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) { define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_i64_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %s = load i64, i64* %p @@ -2794,8 +2611,7 @@ define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8 define <8 x i64> @test_masked_z_i64_to_8_mem_mask3(i64* %p, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq 
%zmm0, %zmm0, %k1 ; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %s = load i64, i64* %p diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll index b6b5a6bcdca..218aa3ffe07 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -14,8 +14,7 @@ define <4 x i32> @test_2xi32_to_4xi32(<4 x i32> %vec) { define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -28,8 +27,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %de define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -40,8 +38,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> % define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq 
@@ -54,8 +51,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %de define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -66,8 +62,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> % define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -80,8 +75,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %de define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -92,8 +86,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> % define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, 
%k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -106,8 +99,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %de define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -126,8 +118,7 @@ define <8 x i32> @test_2xi32_to_8xi32(<8 x i32> %vec) { define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -140,8 +131,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %de define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -152,8 +142,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> % define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x 
i32> %vec, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -166,8 +155,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %de define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -178,8 +166,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> % define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -192,8 +179,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %de define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = 
shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -204,8 +190,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> % define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -218,8 +203,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %de define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -238,8 +222,7 @@ define <16 x i32> @test_2xi32_to_16xi32(<16 x i32> %vec) { define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -252,8 +235,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: 
test_masked_z_2xi32_to_16xi32_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -264,8 +246,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i3 define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -278,8 +259,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -290,8 +270,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i3 define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask2: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -304,8 +283,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -316,8 +294,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i3 define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -330,8 +307,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; 
CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -351,8 +327,7 @@ define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) { define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -365,8 +340,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -378,8 +352,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i3 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -392,8 +365,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: 
test_masked_z_2xi32_to_4xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -405,8 +377,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i3 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -419,8 +390,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -432,8 +402,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i3 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -446,8 +415,7 @@ define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> define <4 
x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -468,8 +436,7 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) { define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -482,8 +449,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -495,8 +461,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -509,8 
+474,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -522,8 +486,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -536,8 +499,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -549,8 +511,7 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; 
CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -563,8 +524,7 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -589,8 +549,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -605,8 +564,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -620,8 +578,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; CHECK-NEXT: vptestnmd 
%zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -636,8 +593,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -651,8 +607,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -667,8 +622,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -682,8 +636,7 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd 
%zmm2, %zmm3, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -698,8 +651,7 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -720,8 +672,7 @@ define <8 x i32> @test_4xi32_to_8xi32_mem(<4 x i32>* %vp) { define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -734,8 +685,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -747,8 +697,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: 
vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -761,8 +710,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -774,8 +722,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -788,8 +735,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -801,8 +747,7 @@ define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i3 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: 
test_masked_4xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -815,8 +760,7 @@ define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -837,8 +781,7 @@ define <16 x i32> @test_4xi32_to_16xi32_mem(<4 x i32>* %vp) { define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -851,8 +794,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -864,8 +806,7 @@ define <16 x i32> 
@test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -878,8 +819,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -891,8 +831,7 @@ define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -905,8 +844,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: 
vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -918,8 +856,7 @@ define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -932,8 +869,7 @@ define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -954,8 +890,7 @@ define <4 x i64> @test_2xi64_to_4xi64_mem(<2 x i64>* %vp) { define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -968,8 +903,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: 
test_masked_z_2xi64_to_4xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -981,8 +915,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i6 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -995,8 +928,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1008,8 +940,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i6 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1022,8 +953,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> define 
<4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1035,8 +965,7 @@ define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i6 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1049,8 +978,7 @@ define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1071,8 +999,7 @@ define <8 x i64> @test_2xi64_to_8xi64_mem(<2 x i64>* %vp) { define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1085,8 +1012,7 @@ 
define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1098,8 +1024,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1112,8 +1037,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1125,8 +1049,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: 
vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1139,8 +1062,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1152,8 +1074,7 @@ define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1166,8 +1087,7 @@ define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1] ; CHECK-NEXT: retq %vec = load <2 x i64>, <2 x i64>* %vp @@ -1188,8 +1108,7 @@ define <16 x i32> @test_8xi32_to_16xi32_mem(<8 x i32>* %vp) { define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1202,8 +1121,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1215,8 +1133,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1229,8 +1146,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1242,8 +1158,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x 
i32>* %vp, <16 x define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1256,8 +1171,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1269,8 +1183,7 @@ define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1283,8 +1196,7 @@ define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: 
vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1305,8 +1217,7 @@ define <8 x i64> @test_4xi64_to_8xi64_mem(<4 x i64>* %vp) { define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1319,8 +1230,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1332,8 +1242,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1346,8 +1255,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask1: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1359,8 +1267,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1373,8 +1280,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1386,8 +1292,7 @@ define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1400,8 +1305,7 @@ define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> define <8 x i64> 
@test_masked_z_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index abed2c04275..decaec05c67 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -22,8 +22,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -40,8 +39,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -57,8 +55,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] ; CHECK-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -75,8 +72,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -92,8 +88,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -110,8 +105,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -139,8 +133,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -157,8 +150,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -189,8 +181,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -209,8 +200,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: 
vmovdqu16 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -229,8 +219,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13] ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -249,8 +238,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13] ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -269,8 +257,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -289,8 +276,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3] ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; 
CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -321,8 +307,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -340,8 +325,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -369,8 +353,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> @@ -384,8 +367,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1 ; CHECK: # %bb.0: ; 
CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -400,8 +382,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10> @@ -415,8 +396,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -431,8 +411,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, 
<32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31> @@ -446,8 +425,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -473,8 +451,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5> @@ -488,8 +465,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -516,8 +492,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u> ; CHECK-NEXT: 
vpermi2w %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -533,8 +508,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -549,8 +523,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -566,8 +539,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -582,8 +554,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -599,8 +570,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -627,8 +597,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -644,8 +613,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -673,8 +641,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 
+; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -690,8 +657,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -709,8 +675,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -726,8 +691,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -745,8 +709,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] ; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, 
%k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -762,8 +725,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -793,8 +755,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -810,8 +771,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -843,8 +803,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; 
CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -862,8 +821,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -881,8 +839,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -900,8 +857,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -919,8 +875,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} 
; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -938,8 +893,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -971,8 +925,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -990,8 +943,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1019,8 +971,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2] ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1036,8 +987,7 @@ define <4 x i32> 
@test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2] ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1053,8 +1003,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3] ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1071,8 +1020,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3] ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1086,8 +1034,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1102,8 +1049,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: 
vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1128,8 +1074,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -1145,8 +1090,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1173,8 +1117,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x ; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1191,8 +1134,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 ; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, 
%xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1209,8 +1151,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1227,8 +1168,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1245,8 +1185,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1263,8 +1202,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = 
xmm1[0,3,3,0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1297,8 +1235,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1317,8 +1254,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1346,8 +1282,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14] ; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> @@ -1361,8 +1296,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 
{%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1377,8 +1311,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> @@ -1392,8 +1325,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1408,8 +1340,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> @@ -1423,8 +1354,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmd 
%ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1450,8 +1380,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> @@ -1465,8 +1394,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1493,8 +1421,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1510,8 +1437,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; 
CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1526,8 +1452,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u> ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1543,8 +1468,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u> ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1559,8 +1483,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1576,8 +1499,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1604,8 +1526,7 @@ define <4 x i32> 
@test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1621,8 +1542,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1645,8 +1565,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1660,8 +1579,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1678,8 +1596,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; 
CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1695,8 +1612,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1714,8 +1630,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1731,8 +1646,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1762,8 +1676,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermi2d %ymm3, %ymm2, 
%ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1779,8 +1692,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1812,8 +1724,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1831,8 +1742,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1850,8 +1760,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6] ; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, 
%xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1869,8 +1778,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6] ; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1888,8 +1796,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1907,8 +1814,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u> ; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1949,8 +1855,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3 ; CHECK-NEXT: vpextrd $2, %xmm2, %eax ; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1973,8 +1878,7 @@ define <4 x i32> 
@test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, ; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2 ; CHECK-NEXT: vpextrd $2, %xmm1, %eax ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1999,8 +1903,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1 +; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -2015,8 +1918,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2030,8 +1932,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2046,8 +1947,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; 
CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2073,8 +1973,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm3[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2090,8 +1989,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2108,8 +2006,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2126,8 +2023,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2151,8 
+2047,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2166,8 +2061,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> @@ -2181,8 +2075,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> @@ -2196,8 +2089,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2212,8 +2104,7 @@ define <4 x i64> 
@test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> @@ -2227,8 +2118,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2254,8 +2144,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> @@ -2269,8 +2158,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2285,8 +2173,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> 
%vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> @@ -2300,8 +2187,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2316,8 +2202,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6] ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> @@ -2331,8 +2216,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2358,8 +2242,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, 
%zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7] ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> @@ -2373,8 +2256,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2389,8 +2271,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> @@ -2404,8 +2285,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -2430,8 +2310,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1 +; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -2447,8 +2326,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2462,8 +2340,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2478,8 +2355,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2500,8 +2376,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, 
%ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2514,8 +2389,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2532,8 +2406,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4] ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2549,8 +2422,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2568,8 +2440,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1] ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: 
retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2585,8 +2456,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2616,8 +2486,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2] ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2633,8 +2502,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2652,8 +2520,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1] ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2669,8 +2536,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x 
i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2688,8 +2554,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,7,1] ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2705,8 +2570,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2736,8 +2600,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2] ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2753,8 +2616,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = 
[7,2,3,2] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2772,8 +2634,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5] ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2789,8 +2650,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2820,8 +2680,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2838,8 +2697,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 ; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, 
%xmm0, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2857,8 +2715,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2876,8 +2733,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll index ac619279aed..5be6ab87461 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll @@ -14,8 +14,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve ; CHECK-LABEL: test_masked_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -29,8 +28,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> % ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 
{{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> @@ -42,8 +40,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve ; CHECK-LABEL: test_masked_16xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -57,8 +54,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> % ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> @@ -70,8 +66,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve ; CHECK-LABEL: test_masked_16xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; 
CHECK-NEXT: retq @@ -85,8 +80,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> % ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> @@ -107,8 +101,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve ; CHECK-LABEL: test_masked_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -122,8 +115,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> % ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> @@ -145,8 +137,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] -; 
CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -160,8 +151,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -175,8 +165,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -190,8 +179,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -205,8 +193,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: 
vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -220,8 +207,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -245,8 +231,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -260,8 +245,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -284,8 +268,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; CHECK-LABEL: test_masked_32xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, 
%k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -299,8 +282,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10> @@ -312,8 +294,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; CHECK-LABEL: test_masked_32xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -327,8 +308,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <32 x 
i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16> @@ -340,8 +320,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve ; CHECK-LABEL: test_masked_32xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -355,8 +334,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27> @@ -377,8 +355,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; CHECK-LABEL: test_masked_32xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; 
CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -392,8 +369,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4> @@ -415,8 +391,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -430,8 +405,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <32 x i16>, 
<32 x i16>* %vp @@ -445,8 +419,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -460,8 +433,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -475,8 +447,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -490,8 +461,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; 
CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -515,8 +485,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -530,8 +499,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -554,8 +522,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; CHECK-LABEL: test_masked_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -569,8 +536,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd 
%ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> @@ -582,8 +548,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; CHECK-LABEL: test_masked_8xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -597,8 +562,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3> @@ -610,8 +574,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; CHECK-LABEL: test_masked_8xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -625,8 +588,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; 
CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4> @@ -647,8 +609,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; CHECK-LABEL: test_masked_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -662,8 +623,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0> @@ -685,8 +645,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -700,8 +659,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: 
vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -715,8 +673,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -730,8 +687,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -745,8 +701,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -760,8 +715,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> % ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -785,8 +739,7 @@ define <8 x i32> 
@test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -800,8 +753,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -824,8 +776,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; CHECK-LABEL: test_masked_16xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -839,8 +790,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7> @@ -852,8 
+802,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; CHECK-LABEL: test_masked_16xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -867,8 +816,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3> @@ -880,8 +828,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; CHECK-LABEL: test_masked_16xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -895,8 +842,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; 
CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5> @@ -917,8 +863,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; CHECK-LABEL: test_masked_16xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -932,8 +877,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12> @@ -955,8 +899,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -970,8 +913,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -985,8 +927,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1000,8 +941,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1015,8 +955,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1030,8 +969,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] -; CHECK-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1055,8 +993,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1070,8 +1007,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1092,8 +1028,7 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) { define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1106,8 +1041,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, 
%ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> @@ -1118,8 +1052,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1132,8 +1065,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> @@ -1144,8 +1076,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1158,8 +1089,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: 
test_masked_z_4xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1> @@ -1178,8 +1108,7 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) { define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1192,8 +1121,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3> @@ -1213,8 +1141,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1227,8 +1154,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> 
@test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1241,8 +1167,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> % define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1255,8 +1180,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1269,8 +1193,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> % define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1283,8 +1206,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, 
<4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1306,8 +1228,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1320,8 +1241,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -1344,8 +1264,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; CHECK-LABEL: test_masked_8xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1359,8 +1278,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask ; CHECK-LABEL: 
test_masked_z_8xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6> @@ -1371,8 +1289,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1385,8 +1302,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5> @@ -1398,8 +1314,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; CHECK-LABEL: test_masked_8xi64_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ 
-1413,8 +1328,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1> @@ -1433,8 +1347,7 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1447,8 +1360,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5> @@ -1460,8 +1372,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; CHECK-LABEL: test_masked_8xi64_perm_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; 
CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1475,8 +1386,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3> @@ -1487,8 +1397,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1501,8 +1410,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> @@ -1523,8 +1431,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; CHECK-LABEL: test_masked_8xi64_perm_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] -; CHECK-NEXT: 
vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1538,8 +1445,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask ; CHECK-LABEL: test_masked_z_8xi64_perm_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7> @@ -1550,8 +1456,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1564,8 +1469,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7> @@ -1587,8 +1491,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve ; CHECK-LABEL: 
test_masked_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1602,8 +1505,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1616,8 +1518,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1630,8 +1531,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1645,8 +1545,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve ; CHECK-LABEL: 
test_masked_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1660,8 +1559,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1683,8 +1581,7 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1697,8 +1594,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1712,8 +1608,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4: ; CHECK: 
# %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1727,8 +1622,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1741,8 +1635,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1755,8 +1648,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1780,8 +1672,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1795,8 +1686,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1809,8 +1699,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1823,8 +1712,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll index 1896356dafa..d4f12747028 100644 --- 
a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll @@ -1030,8 +1030,7 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1044,8 +1043,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -1056,8 +1054,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1070,8 +1067,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x 
i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1082,8 +1078,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmd %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1096,8 +1091,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -1116,8 +1110,7 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: 
vptestnmd %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1130,8 +1123,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1151,8 +1143,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1166,8 +1157,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -1180,8 +1170,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i define <8 x 
i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1195,8 +1184,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -1209,8 +1197,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1224,8 +1211,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = 
ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -1247,8 +1233,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1262,8 +1247,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p @@ -1284,8 +1268,7 @@ define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) { define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1298,8 +1281,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: 
test_16xi32_zero_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> @@ -1310,8 +1292,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1324,8 +1305,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> @@ -1336,8 +1316,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x 
i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1350,8 +1329,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19> @@ -1370,8 +1348,7 @@ define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) { define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmd %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1384,8 +1361,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> % define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, 
%zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> @@ -1405,8 +1381,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %ve define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1420,8 +1395,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -1434,8 +1408,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = 
zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1449,8 +1422,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] ; CHECK-NEXT: retq %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -1463,8 +1435,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1478,8 +1449,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] ; CHECK-NEXT: retq %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -1501,8 +1471,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %ve define <16 x i32> 
@test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1516,8 +1485,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] ; CHECK-NEXT: retq %vec2 = load <16 x i32>, <16 x i32>* %vec2p @@ -1538,8 +1506,7 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1552,8 +1519,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 
{{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1564,8 +1530,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1578,8 +1543,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1590,8 +1554,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1604,8 +1567,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> 
%vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1624,8 +1586,7 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vptestnmq %ymm3, %ymm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -1638,8 +1599,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1659,8 +1619,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = 
ymm0[2,3],mem[2,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1674,8 +1633,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -1688,8 +1646,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1703,8 +1660,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -1717,8 +1673,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1732,8 +1687,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -1755,8 +1709,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1770,8 +1723,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p @@ -1792,8 +1744,7 @@ define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) { define <8 x i64> 
@test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1806,8 +1757,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13> @@ -1818,8 +1768,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1832,8 +1781,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; 
CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13> @@ -1844,8 +1792,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1858,8 +1805,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9> @@ -1878,8 +1824,7 @@ define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) { define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vptestnmq %zmm3, %zmm3, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1892,8 +1837,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 
x i64> %vec1, <8 x i64> %vec2 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11> @@ -1913,8 +1857,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1928,8 +1871,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -1942,8 +1884,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; 
CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1957,8 +1898,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] ; CHECK-NEXT: retq %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -1971,8 +1911,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1986,8 +1925,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i64>, <8 x i64>* %vec2p @@ -2009,8 +1947,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) define <8 x i64> 
@test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -2024,8 +1961,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i64>, <8 x i64>* %vec2p diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll index df46487d9ab..67de50a83a6 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll @@ -12,8 +12,7 @@ define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) { define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_16xi8_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -26,8 +25,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_16xi8_perm_mask0: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> @@ -38,8 +36,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_16xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -52,8 +49,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_16xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> @@ -64,8 +60,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_16xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmb 
%xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -78,8 +73,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_16xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> @@ -98,8 +92,7 @@ define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) { define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_16xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -112,8 +105,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) { ; CHECK-LABEL: test_masked_z_16xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 
6> @@ -135,8 +127,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -150,8 +141,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> % ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -165,8 +155,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -180,8 +169,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> % ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -195,8 +183,7 @@ define <16 x i8> 
@test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -210,8 +197,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> % ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -235,8 +221,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestnmb %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -250,8 +235,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> % ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: retq %vec = load <16 x i8>, <16 x i8>* %vp @@ -272,8 +256,7 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) { define <32 x i8> @test_masked_32xi8_perm_mask0(<32 
x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_32xi8_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -286,8 +269,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_32xi8_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21> @@ -298,8 +280,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_32xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -312,8 +293,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> 
@test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_32xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24> @@ -324,8 +304,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_32xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -338,8 +317,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_32xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 
15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29> @@ -358,8 +336,7 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) { define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_32xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -372,8 +349,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) { ; CHECK-LABEL: test_masked_z_32xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18> @@ -395,8 +371,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = 
ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -410,8 +385,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> % ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -425,8 +399,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -440,8 +413,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> % ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -455,8 +427,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, 
%ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -470,8 +441,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> % ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -495,8 +465,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -510,8 +479,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> % ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] ; CHECK-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %vp @@ -532,8 +500,7 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) 
{ ; CHECK-LABEL: test_masked_64xi8_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -546,8 +513,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_64xi8_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] ; CHECK-NEXT: retq %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62> @@ -558,8 +524,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_64xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: 
vpcmpeqb %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -572,8 +537,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_64xi8_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] ; CHECK-NEXT: retq %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49> @@ -584,8 +548,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_64xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = 
zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -598,8 +561,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_64xi8_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] ; CHECK-NEXT: retq %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60> @@ -618,8 +580,7 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_64xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = 
zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -632,8 +593,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) { ; CHECK-LABEL: test_masked_z_64xi8_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] ; CHECK-NEXT: retq %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61> @@ -655,8 +615,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = 
zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -670,8 +629,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> % ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -685,8 +643,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -700,8 +657,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> % ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = 
zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -715,8 +671,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -730,8 +685,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> % ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -755,8 +709,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = 
zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -770,8 +723,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> % ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] ; CHECK-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %vp @@ -792,8 +744,7 @@ define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -806,8 +757,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6> @@ -818,8 +768,7 @@ 
define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -832,8 +781,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -844,8 +792,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> % define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -858,8 +805,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5> @@ -878,8 +824,7 @@ define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -892,8 +837,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7> @@ -904,8 +848,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> % define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -918,8 +861,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, 
<8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6> @@ -930,8 +872,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -944,8 +885,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7> @@ -964,8 +904,7 @@ define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) { define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] ; 
CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -978,8 +917,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5> @@ -990,8 +928,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -1004,8 +941,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -1025,8 +961,7 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: 
test_masked_8xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1039,8 +974,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1053,8 +987,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1067,8 +1000,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1081,8 +1013,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 define <8 x i16> 
@test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1095,8 +1026,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1118,8 +1048,7 @@ define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1132,8 +1061,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1146,8 +1074,7 @@ define 
<8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1160,8 +1087,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1174,8 +1100,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1188,8 +1113,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = 
mem[2,1,3,2,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1211,8 +1135,7 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) { define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1225,8 +1148,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1239,8 +1161,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1253,8 +1174,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 
+; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] ; CHECK-NEXT: retq %vec = load <8 x i16>, <8 x i16>* %vp @@ -1275,8 +1195,7 @@ define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1289,8 +1208,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12> @@ -1301,8 +1219,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1315,8 +1232,7 @@ define <16 x i16> 
@test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -1327,8 +1243,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1341,8 +1256,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13> @@ -1361,8 +1275,7 @@ define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) { define <16 x i16> 
@test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1375,8 +1288,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -1387,8 +1299,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1401,8 +1312,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: 
vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15> @@ -1413,8 +1323,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1427,8 +1336,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> @@ -1447,8 +1355,7 @@ define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, 
%k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1461,8 +1368,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13> @@ -1473,8 +1379,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -1487,8 +1392,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 
5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15> @@ -1508,8 +1412,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1522,8 +1425,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1536,8 +1438,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1550,8 +1451,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1564,8 +1464,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1578,8 +1477,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1601,8 +1499,7 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1615,8 +1512,7 @@ define <16 x i16> 
@test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1629,8 +1525,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1643,8 +1538,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1657,8 +1551,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; 
CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1671,8 +1564,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1694,8 +1586,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) { define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1708,8 +1599,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1722,8 +1612,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 define <16 x i16> 
@test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1736,8 +1625,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -1758,8 +1646,7 @@ define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1772,8 +1659,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} 
zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28> @@ -1784,8 +1670,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1798,8 +1683,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -1810,8 +1694,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, 
<32 x i1 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1824,8 +1707,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31> @@ -1844,8 +1726,7 @@ define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1858,8 +1739,7 @@ define <32 x i16> 
@test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -1870,8 +1750,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1884,8 +1763,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] ; CHECK-NEXT: retq %shuf = 
shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30> @@ -1896,8 +1774,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1910,8 +1787,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -1930,8 +1806,7 @@ define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: 
test_masked_32xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1944,8 +1819,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30> @@ -1956,8 +1830,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -1970,8 +1843,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> define <32 x i16> 
@test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31> @@ -1991,8 +1863,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2005,8 +1876,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2019,8 +1889,7 @@ define <32 x i16> 
@test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2033,8 +1902,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2047,8 +1915,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2061,8 +1928,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: 
test_masked_z_32xi16_perm_high_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2084,8 +1950,7 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2098,8 +1963,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2112,8 +1976,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; 
CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2126,8 +1989,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2141,8 +2003,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2156,8 +2017,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2179,8 +2039,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: 
test_masked_32xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2193,8 +2052,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2207,8 +2065,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2221,8 +2078,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: 
vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -2243,8 +2099,7 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -2257,8 +2112,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0> @@ -2269,8 +2123,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -2283,8 +2136,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0> @@ -2295,8 +2147,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -2309,8 +2160,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0> @@ -2329,8 +2179,7 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -2343,8 +2192,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) { ; 
CHECK-LABEL: test_masked_z_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> @@ -2364,8 +2212,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) { define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2378,8 +2225,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2392,8 +2238,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2406,8 +2251,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> 
@test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2420,8 +2264,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2434,8 +2277,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2457,8 +2299,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2471,8 +2312,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve 
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp @@ -2493,8 +2333,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2507,8 +2346,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4> @@ -2519,8 +2357,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2533,8 
+2370,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7> @@ -2545,8 +2381,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2559,8 +2394,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7> @@ -2579,8 +2413,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, 
%k1 +; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -2593,8 +2426,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4> @@ -2614,8 +2446,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2628,8 +2459,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2642,8 +2472,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: 
test_masked_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2656,8 +2485,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2670,8 +2498,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2684,8 +2511,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2707,8 +2533,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 
x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2721,8 +2546,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -2743,8 +2567,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -2757,8 +2580,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, 
i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12> @@ -2769,8 +2591,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -2783,8 +2604,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12> @@ -2795,8 +2615,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -2809,8 +2628,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 
x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12> @@ -2829,8 +2647,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -2843,8 +2660,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15> @@ -2864,8 +2680,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor 
%xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2878,8 +2693,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2892,8 +2706,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2906,8 +2719,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2920,8 +2732,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 define <16 x 
i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2934,8 +2745,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2957,8 +2767,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2971,8 +2780,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] ; 
CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll index 09e7e646ca4..9792c4990c6 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -44,8 +44,7 @@ define <16 x float> @_inreg16xfloat(float %a) { define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) { ; ALL-LABEL: _ss16xfloat_mask: ; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; ALL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; ALL-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} ; ALL-NEXT: vmovaps %zmm1, %zmm0 ; ALL-NEXT: retq @@ -59,8 +58,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) { ; ALL-LABEL: _ss16xfloat_maskz: ; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; ALL-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -84,8 +82,7 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) { define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) { ; ALL-LABEL: _ss16xfloat_mask_load: ; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} ; ALL-NEXT: retq %a = load float, float* %a.ptr @@ -99,8 +96,7 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) { ; ALL-LABEL: _ss16xfloat_maskz_load: ; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} ; 
ALL-NEXT: retq %a = load float, float* %a.ptr @@ -125,8 +121,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m ; ALL-LABEL: _sd8xdouble_mask: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2 -; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; ALL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} ; ALL-NEXT: vmovapd %zmm1, %zmm0 ; ALL-NEXT: retq @@ -141,8 +136,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { ; ALL-LABEL: _sd8xdouble_maskz: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1 -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -167,8 +161,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 ; ALL-LABEL: _sd8xdouble_mask_load: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1 -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; ALL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} ; ALL-NEXT: retq %a = load double, double* %a.ptr @@ -183,8 +176,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) ; ALL-LABEL: _sd8xdouble_maskz_load: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} ; ALL-NEXT: retq %a = load double, double* %a.ptr diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 3d552da7330..b84d61b5a25 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -926,8 +926,7 @@ define <2 x i64> @test46(<2 
x float> %x, <2 x float> %y) #0 { define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { ; KNL-LABEL: test47: ; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 @@ -938,8 +937,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: ## kill: def %xmm2 killed %xmm2 def %zmm2 ; AVX512BW-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vpblendmb %zmm1, %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512BW-NEXT: vzeroupper @@ -947,8 +945,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { ; ; SKX-LABEL: test47: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; SKX-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; SKX-NEXT: vpblendmb %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -960,8 +957,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) { ; KNL-LABEL: test48: ; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 @@ -971,16 +967,14 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: ## kill: def %ymm2 killed %ymm2 def %zmm2 ; AVX512BW-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; 
AVX512BW-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512BW-NEXT: retq ; ; SKX-LABEL: test48: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; SKX-NEXT: vpcmpeqd %zmm3, %zmm0, %k1 +; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; SKX-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %cmp = icmp eq <16 x i32> %a, zeroinitializer @@ -991,8 +985,7 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) { define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) { ; KNL-LABEL: test49: ; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL-NEXT: vpcmpeqq %zmm3, %zmm0, %k1 +; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 @@ -1003,8 +996,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: ## kill: def %xmm2 killed %xmm2 def %zmm2 ; AVX512BW-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpcmpeqq %zmm3, %zmm0, %k1 +; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1012,8 +1004,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) { ; ; SKX-LABEL: test49: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; SKX-NEXT: vpcmpeqq %zmm3, %zmm0, %k1 +; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; SKX-NEXT: vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512bw-mov.ll b/llvm/test/CodeGen/X86/avx512bw-mov.ll index 7158fb262c0..e968d76994f 100644 --- a/llvm/test/CodeGen/X86/avx512bw-mov.ll +++ 
b/llvm/test/CodeGen/X86/avx512bw-mov.ll @@ -24,8 +24,7 @@ define void @test2(i8 * %addr, <64 x i8> %data) { define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmb %zmm1, %zmm1, %k1 ; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <64 x i8> %mask1, zeroinitializer @@ -38,8 +37,7 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) { ; CHECK-LABEL: test4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k1 ; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <64 x i8> %mask1, zeroinitializer @@ -72,8 +70,7 @@ define void @test6(i8 * %addr, <32 x i16> %data) { define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { ; CHECK-LABEL: test7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1 +; CHECK-NEXT: vptestmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <32 x i16> %mask1, zeroinitializer @@ -86,8 +83,7 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) { ; CHECK-LABEL: test8: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k1 +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <32 x i16> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll index 1826890d49c..508595e4366 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll @@ -24,8 +24,7 @@ define 
void @test_256_2(i8 * %addr, <32 x i8> %data) { define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) { ; CHECK-LABEL: test_256_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04] +; CHECK-NEXT: vptestmb %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x26,0xc9] ; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <32 x i8> %mask1, zeroinitializer @@ -38,8 +37,7 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) { define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) { ; CHECK-LABEL: test_256_4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04] +; CHECK-NEXT: vptestmb %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc8] ; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <32 x i8> %mask1, zeroinitializer @@ -72,8 +70,7 @@ define void @test_256_6(i8 * %addr, <16 x i16> %data) { define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) { ; CHECK-LABEL: test_256_7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04] +; CHECK-NEXT: vptestmw %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x26,0xc9] ; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i16> %mask1, zeroinitializer @@ -86,8 +83,7 @@ define <16 x i16> 
@test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) { define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) { ; CHECK-LABEL: test_256_8: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04] +; CHECK-NEXT: vptestmw %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc8] ; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i16> %mask1, zeroinitializer @@ -120,8 +116,7 @@ define void @test_128_2(i8 * %addr, <16 x i8> %data) { define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) { ; CHECK-LABEL: test_128_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04] +; CHECK-NEXT: vptestmb %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x26,0xc9] ; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i8> %mask1, zeroinitializer @@ -134,8 +129,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) { define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) { ; CHECK-LABEL: test_128_4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04] +; CHECK-NEXT: vptestmb %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc8] ; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i8> %mask1, zeroinitializer @@ -168,8 +162,7 @@ define void 
@test_128_6(i8 * %addr, <8 x i16> %data) { define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) { ; CHECK-LABEL: test_128_7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04] +; CHECK-NEXT: vptestmw %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x26,0xc9] ; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i16> %mask1, zeroinitializer @@ -182,8 +175,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) { define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) { ; CHECK-LABEL: test_128_8: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04] +; CHECK-NEXT: vptestmw %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc8] ; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i16> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vl-arith.ll b/llvm/test/CodeGen/X86/avx512vl-arith.ll index beaefe92aac..967ac3b7948 100755 --- a/llvm/test/CodeGen/X86/avx512vl-arith.ll +++ b/llvm/test/CodeGen/X86/avx512vl-arith.ll @@ -76,8 +76,7 @@ define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind { define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1 ## 
encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -89,8 +88,7 @@ define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mas define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_maskz_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -102,8 +100,7 @@ define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %ma define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_mask_fold_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -116,8 +113,7 @@ define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_mask_broadcast_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX 
TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI10_0-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -130,8 +126,7 @@ define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_maskz_fold_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -144,8 +139,7 @@ define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd256_maskz_broadcast_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI12_0-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] @@ 
-216,8 +210,7 @@ define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind { define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vaddps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -229,8 +222,7 @@ define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmulps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vmulps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x59,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -242,8 +234,7 @@ define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1)nounwind readnone { ; CHECK-LABEL: test_mask_vminps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, 
%k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vminps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5d,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -256,8 +247,7 @@ define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmaxps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vmaxps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -270,8 +260,7 @@ define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vsubps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5c,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -283,8 +272,7 @@ define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vdivps_256: ; CHECK: ## 
%bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb] ; CHECK-NEXT: vdivps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5e,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -296,8 +284,7 @@ define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmulpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vmulpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x59,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -309,8 +296,7 @@ define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vminpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vminpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5d,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -323,8 +309,7 @@ define <4 
x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmaxpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vmaxpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -337,8 +322,7 @@ define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vsubpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vsubpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5c,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -350,8 +334,7 @@ define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vdivpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## 
encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vdivpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5e,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -363,8 +346,7 @@ define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vaddpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb] ; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -376,8 +358,7 @@ define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_maskz_vaddpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -389,8 +370,7 @@ define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4 define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind { ; CHECK-LABEL: test_mask_fold_vaddpd_256: ; CHECK: ## 
%bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca] ; CHECK-NEXT: vaddpd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -403,8 +383,7 @@ define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> % define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind { ; CHECK-LABEL: test_maskz_fold_vaddpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vaddpd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -429,8 +408,7 @@ define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nou define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, double* %j, <4 x i64> %mask1) nounwind { ; CHECK-LABEL: test_mask_broadcast_vaddpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpcmpneqq %ymm0, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xc8,0x04] +; CHECK-NEXT: vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca] ; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm1, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x0f] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; CHECK-NEXT: retq 
## encoding: [0xc3] @@ -446,8 +424,7 @@ define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x doub define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j, <4 x i64> %mask1) nounwind { ; CHECK-LABEL: test_maskz_broadcast_vaddpd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -524,8 +501,7 @@ define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind { define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x08,0x27,0xca] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -537,8 +513,7 @@ define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mas define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_maskz_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ## encoding: 
[0x62,0xf2,0x6d,0x08,0x27,0xca] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -550,8 +525,7 @@ define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %ma define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_mask_fold_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -564,8 +538,7 @@ define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_mask_broadcast_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI46_0-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -578,8 +551,7 @@ define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_maskz_fold_test: ; CHECK: 
## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -592,8 +564,7 @@ define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: vpaddd128_maskz_broadcast_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI48_0-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -664,8 +635,7 @@ define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind { define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vaddps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer 
@@ -677,8 +647,7 @@ define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmulps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vmulps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x59,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -690,8 +659,7 @@ define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vminps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vminps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5d,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -704,8 +672,7 @@ define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmaxps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd 
%xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vmaxps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -718,8 +685,7 @@ define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vsubps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vsubps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5c,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -732,8 +698,7 @@ define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vdivps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb] ; CHECK-NEXT: vdivps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5e,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -745,8 +710,7 @@ define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: 
test_mask_vmulpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vmulpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x59,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -758,8 +722,7 @@ define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vminpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vminpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5d,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -772,8 +735,7 @@ define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vmaxpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vmaxpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, 
zeroinitializer @@ -786,8 +748,7 @@ define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vsubpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vsubpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5c,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -799,8 +760,7 @@ define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vdivpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vdivpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5e,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -812,8 +772,7 @@ define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone { ; CHECK-LABEL: test_mask_vaddpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: 
[0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04] +; CHECK-NEXT: vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb] ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -825,8 +784,7 @@ define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j, ; CHECK-LABEL: test_maskz_vaddpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] <2 x i64> %mask1) nounwind readnone { @@ -839,8 +797,7 @@ define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j, define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind { ; CHECK-LABEL: test_mask_fold_vaddpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] -; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04] +; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca] ; CHECK-NEXT: vaddpd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -853,8 +810,7 @@ define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> % define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind { ; CHECK-LABEL: test_maskz_fold_vaddpd_128: 
; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vaddpd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -879,8 +835,7 @@ define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nou define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, double* %j, <2 x i64> %mask1) nounwind { ; CHECK-LABEL: test_mask_broadcast_vaddpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] -; CHECK-NEXT: vpcmpneqq %xmm0, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xc8,0x04] +; CHECK-NEXT: vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca] ; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm1, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x0f] ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -896,8 +851,7 @@ define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x doub define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j, <2 x i64> %mask1) nounwind { ; CHECK-LABEL: test_maskz_broadcast_vaddpd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x58,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask 
= icmp ne <2 x i64> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vl-mov.ll b/llvm/test/CodeGen/X86/avx512vl-mov.ll index f0ce312305f..90d9ff3250d 100644 --- a/llvm/test/CodeGen/X86/avx512vl-mov.ll +++ b/llvm/test/CodeGen/X86/avx512vl-mov.ll @@ -164,8 +164,7 @@ define <8 x float> @test_256_16(i8 * %addr) { define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK-LABEL: test_256_17: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -178,8 +177,7 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK-LABEL: test_256_18: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9] ; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -192,8 +190,7 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) { ; CHECK-LABEL: test_256_19: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04] +; 
CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8] ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -206,8 +203,7 @@ define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) { define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) { ; CHECK-LABEL: test_256_20: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8] ; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -220,8 +216,7 @@ define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) { define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_21: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -234,8 +229,7 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_22: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] 
+; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -248,8 +242,7 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_23: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8] ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -262,8 +255,7 @@ define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) { define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_24: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8] ; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -332,8 +324,7 @@ define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) { define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_29: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] +; 
CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -346,8 +337,7 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_30: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9] ; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -360,8 +350,7 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_31: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8] ; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -374,8 +363,7 @@ define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) { define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) { ; CHECK-LABEL: test_256_32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: 
[0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8] ; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer @@ -548,8 +536,7 @@ define <4 x float> @test_128_16(i8 * %addr) { define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_17: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -562,8 +549,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_18: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -576,8 +562,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_19: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: 
[0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8] ; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -590,8 +575,7 @@ define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) { define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_20: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8] ; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -604,8 +588,7 @@ define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) { define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_21: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -618,8 +601,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_22: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## 
encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -632,8 +614,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_23: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8] ; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -646,8 +627,7 @@ define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) { define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_24: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8] ; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -660,8 +640,7 @@ define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) { define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_25: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## 
encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -674,8 +653,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_26: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] ; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -688,8 +666,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_27: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8] ; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -702,8 +679,7 @@ define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) { define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) { ; CHECK-LABEL: test_128_28: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, 
%k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8] ; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -716,8 +692,7 @@ define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) { define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_29: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -730,8 +705,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_30: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9] ; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -744,8 +718,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_31: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; 
CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8] ; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer @@ -758,8 +731,7 @@ define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) { define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) { ; CHECK-LABEL: test_128_32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04] +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8] ; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll index 7d24b8161e5..97fa973127b 100644 --- a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll @@ -70,8 +70,7 @@ define <8 x float> @_inreg8xfloat(float %a) { define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) { ; CHECK-LABEL: _ss8xfloat_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vptestmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vbroadcastss %xmm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -84,8 +83,7 @@ define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1 define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) { ; CHECK-LABEL: _ss8xfloat_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 +; 
CHECK-NEXT: vptestmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -108,8 +106,7 @@ define <4 x float> @_inreg4xfloat(float %a) { define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) { ; CHECK-LABEL: _ss4xfloat_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -122,8 +119,7 @@ define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1 define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) { ; CHECK-LABEL: _ss4xfloat_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -146,8 +142,7 @@ define <4 x double> @_inreg4xdouble(double %a) { define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) { ; CHECK-LABEL: _ss4xdouble_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vptestmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -160,8 +155,7 @@ define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %m define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) { ; CHECK-LABEL: _ss4xdouble_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %mask = icmp ne <4 x i32> %mask1, zeroinitializer @@ -185,8 +179,7 @@ define 
<2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> % define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) { ; CHECK-LABEL: test_v2f64_broadcast_fold_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpneqq %xmm3, %xmm1, %k1 +; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/compress_expand.ll b/llvm/test/CodeGen/X86/compress_expand.ll index fb550be6310..57767e23e3d 100644 --- a/llvm/test/CodeGen/X86/compress_expand.ll +++ b/llvm/test/CodeGen/X86/compress_expand.ll @@ -256,7 +256,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq ; @@ -265,7 +265,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) ; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0 +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} @@ -281,7 +281,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1} ; SKX-NEXT: retq ; @@ -290,7 +290,7 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { ; 
KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; KNL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0 +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} @@ -303,9 +303,8 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) { define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) { ; ALL-LABEL: test15: ; ALL: # %bb.0: -; ALL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; ALL-NEXT: vptestnmd %zmm3, %zmm3, %k1 +; ALL-NEXT: vptestnmd %zmm2, %zmm2, %k2 ; ALL-NEXT: kmovw %k2, %eax ; ALL-NEXT: popcntl %eax, %eax ; ALL-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k1} @@ -320,9 +319,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri ; SKX-LABEL: test16: ; SKX: # %bb.0: ; SKX-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 -; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2 +; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 +; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k2 ; SKX-NEXT: kmovb %k2, %eax ; SKX-NEXT: popcntl %eax, %eax ; SKX-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} @@ -331,10 +329,9 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri ; ; KNL-LABEL: test16: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; KNL-NEXT: vpcmpeqd %zmm3, %zmm4, %k1 -; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k2 +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; KNL-NEXT: vptestnmd %zmm3, %zmm3, %k1 +; KNL-NEXT: vptestnmd %zmm2, %zmm2, %k2 ; KNL-NEXT: vexpandpd (%rdi), %zmm0 {%k2} ; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: movzbl %al, %eax @@ -349,9 +346,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri 
define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) { ; SKX-LABEL: test17: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 +; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k2 ; SKX-NEXT: kmovw %k2, %eax ; SKX-NEXT: popcntl %eax, %eax ; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} @@ -361,9 +357,8 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) { ; ; KNL-LABEL: test17: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; KNL-NEXT: vptestnmd %zmm3, %zmm3, %k1 +; KNL-NEXT: vptestnmd %zmm2, %zmm2, %k2 ; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: popcntl %eax, %eax ; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 941fdc6c15b..574f271e4a4 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -2769,10 +2769,9 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1 ; KNL_64-LABEL: test_gather_setcc_split: ; KNL_64: # %bb.0: ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4 -; KNL_64-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm6, %k1 -; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm1, %k2 +; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2} ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1} ; KNL_64-NEXT: vmovapd %zmm2, %zmm0 @@ -2791,10 +2790,9 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3 ; KNL_32-NEXT: movl 8(%ebp), %eax ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, 
%ymm4 -; KNL_32-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm6, %k1 -; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k2 +; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2} ; KNL_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1} ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 @@ -2807,9 +2805,8 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1 ; SKX: # %bb.0: ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; SKX-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; SKX-NEXT: vpcmpeqd %ymm6, %ymm5, %k1 -; SKX-NEXT: vpcmpeqd %ymm6, %ymm1, %k2 +; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1 +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2} ; SKX-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1} ; SKX-NEXT: vmovapd %zmm2, %zmm0 @@ -2829,9 +2826,8 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1 ; SKX_32-NEXT: movl 8(%ebp), %eax ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; SKX_32-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; SKX_32-NEXT: vpcmpeqd %ymm6, %ymm5, %k1 -; SKX_32-NEXT: vpcmpeqd %ymm6, %ymm1, %k2 +; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1 +; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2} ; SKX_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1} ; SKX_32-NEXT: vmovapd %zmm2, %zmm0 @@ -2851,10 +2847,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> ; KNL_64-LABEL: test_scatter_setcc_split: ; KNL_64: # %bb.0: ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4 -; KNL_64-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; KNL_64-NEXT: vpcmpeqd %zmm5, %zmm1, %k1 +; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_64-NEXT: 
vpcmpeqd %zmm5, %zmm1, %k2 +; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2} ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1} ; KNL_64-NEXT: vzeroupper @@ -2872,10 +2867,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3 ; KNL_32-NEXT: movl 8(%ebp), %eax ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4 -; KNL_32-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k1 +; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_32-NEXT: vpcmpeqd %zmm5, %zmm1, %k2 +; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2} ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1} ; KNL_32-NEXT: movl %ebp, %esp @@ -2886,10 +2880,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> ; SKX-LABEL: test_scatter_setcc_split: ; SKX: # %bb.0: ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4 -; SKX-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; SKX-NEXT: vpcmpeqd %ymm5, %ymm1, %k1 +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; SKX-NEXT: vpcmpeqd %ymm5, %ymm1, %k2 +; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2} ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1} ; SKX-NEXT: vzeroupper @@ -2907,10 +2900,9 @@ define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3 ; SKX_32-NEXT: movl 8(%ebp), %eax ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4 -; SKX_32-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; SKX_32-NEXT: vpcmpeqd %ymm5, %ymm1, %k1 +; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; SKX_32-NEXT: vpcmpeqd %ymm5, %ymm1, %k2 +; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2} ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) 
{%k1} ; SKX_32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 80dabcdd123..cd28147878c 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -101,8 +101,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} @@ -112,8 +111,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> ; ; SKX-LABEL: test6: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer @@ -134,8 +132,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -145,8 +142,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d ; ; SKX-LABEL: test7: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, 
zeroinitializer @@ -175,8 +171,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} @@ -186,8 +181,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { ; ; SKX-LABEL: test8: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -214,8 +208,7 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} @@ -224,8 +217,7 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; ; SKX-LABEL: test9: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -259,8 +251,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; 
AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} @@ -269,8 +260,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double ; ; SKX-LABEL: test10: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -301,8 +291,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl ; AVX512F-LABEL: test10b: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} @@ -311,8 +300,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl ; ; SKX-LABEL: test10b: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -344,8 +332,7 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -354,8 +341,7 @@ 
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> ; ; SKX-LABEL: test11a: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer @@ -510,8 +496,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1 ; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} @@ -520,8 +505,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { ; ; SKX-LABEL: test12: ; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -554,7 +538,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} @@ -565,7 +549,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovups %xmm1, 
(%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -598,10 +582,10 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -610,7 +594,7 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -644,7 +628,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> % ; AVX512F-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} @@ -656,7 +640,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> % ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, 
zeroinitializer @@ -693,10 +677,10 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -706,7 +690,7 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 @@ -739,7 +723,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} @@ -751,7 +735,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll 
b/llvm/test/CodeGen/X86/nontemporal-loads.ll index 308395d365c..37ff7115ac9 100644 --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1900,8 +1900,7 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m ; ; AVX512-LABEL: test_masked_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512-NEXT: vmovntdqa (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr35272.ll b/llvm/test/CodeGen/X86/pr35272.ll index 0df1d7cb83c..0b832d56310 100644 --- a/llvm/test/CodeGen/X86/pr35272.ll +++ b/llvm/test/CodeGen/X86/pr35272.ll @@ -4,8 +4,7 @@ define <2 x i48> @PR35272(<2 x i64> %a0, <2 x i48> %a1, <2 x i48> %a2) { ; CHECK-LABEL: PR35272: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1 +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpblendmq %xmm1, %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %1 = icmp eq <2 x i64> %a0, zeroinitializer diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index 00d3a5c67dc..2745055da99 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -7,8 +7,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { ; AVX256-LABEL: testv8i1_sext_v8i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 @@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { ; ; AVX512VL-LABEL: testv8i1_sext_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 @@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_sext_v16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -58,9 +59,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_sext_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -70,10 +72,9 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX512F-LABEL: testv16i1_sext_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; 
AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -91,9 +92,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_sext_v16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -104,9 +106,10 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_sext_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 @@ -115,10 +118,9 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX512F-LABEL: testv16i1_sext_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 @@ -135,8 +137,8 @@ define <16 x i16> 
@testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { ; AVX256-LABEL: testv8i1_zext_v8i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX256-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 ; AVX256-NEXT: vzeroupper @@ -144,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { ; ; AVX512VL-LABEL: testv8i1_zext_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -168,9 +170,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_zext_v16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: movl {{.*}}(%rip), %eax ; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k2} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 @@ -185,9 +188,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_zext_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: 
kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -197,10 +201,9 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX512F-LABEL: testv16i1_zext_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -218,9 +221,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_zext_v16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: movl {{.*}}(%rip), %eax ; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 @@ -231,9 +235,10 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_zext_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 @@ -242,10 +247,9 @@ define <16 x i16> 
@testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX512F-LABEL: testv16i1_zext_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 39713666fcd..77d66bd9b08 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -11,9 +11,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) { ; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX256VL: # %bb.0: -; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256VL-NEXT: vmovdqa (%rsi), %ymm1 +; AVX256VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 @@ -44,9 +45,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, 
%zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -59,9 +61,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX256VLBW: # %bb.0: -; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX256VLBW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256VLBW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX256VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0 ; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1 ; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -73,9 +76,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -89,9 +93,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm1, %k2 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: 
vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -106,9 +109,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm1, %k2 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -200,8 +202,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX256VLBW: # %bb.0: -; AVX256VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX256VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; AVX256VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0 ; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u] @@ -214,8 +215,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; AVX512VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VLBW-NEXT: vpermw %zmm0, %zmm1, %zmm0 @@ -226,8 +226,7 @@ 
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512BW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index 2f62d92664a..10db0aeb25e 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -370,8 +370,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0 @@ -457,8 +456,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 743fff3feee..6597925bab9 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ 
b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4591,8 +4591,7 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) { ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12] -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; AVX512VL-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; AVX512VL-NEXT: retq %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 7da1f2c9e08..66bd70ec81d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -307,8 +307,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: ; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; VL_BW_DQ-NEXT: vpcmpeqw %zmm3, %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0 @@ -364,8 +363,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8: ; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; VL_BW_DQ-NEXT: vpcmpeqb %ymm3, %ymm0, %k0 +; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; 
VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0 @@ -381,9 +379,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) { ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %zmm6, %zmm1, %k2 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] @@ -397,9 +394,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm0, %k1 -; AVX512VL-NEXT: vpcmpeqd %zmm6, %zmm1, %k2 +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] @@ -413,9 +409,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm0, %k0 -; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; 
VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] @@ -434,9 +429,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) { ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %zmm4, %zmm1, %k2 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] @@ -450,9 +444,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm0, %k1 -; AVX512VL-NEXT: vpcmpeqd %zmm4, %zmm1, %k2 +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] @@ -466,9 +459,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split: ; VL_BW_DQ: # %bb.0: -; 
VL_BW_DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm0, %k0 -; VL_BW_DQ-NEXT: vpcmpeqd %zmm4, %zmm1, %k1 +; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] |