diff options
author | Craig Topper <craig.topper@intel.com> | 2018-02-02 05:59:31 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-02-02 05:59:31 +0000 |
commit | 5570e03b21e9304e51f63d75e9028c100a1b4c6a (patch) | |
tree | c8b6616b2093cdb692d667854a0d18e16582d966 /llvm | |
parent | b22c1d29bc7fd6ac3f389ac313ef4b80e493d966 (diff) | |
download | bcm5719-llvm-5570e03b21e9304e51f63d75e9028c100a1b4c6a.tar.gz bcm5719-llvm-5570e03b21e9304e51f63d75e9028c100a1b4c6a.zip |
[X86] Legalize (i64 (bitcast (v64i1 X))) on 32-bit targets by extracting to v32i1 and bitcasting to i32.
This saves a trip through memory and seems to open up other combining opportunities.
llvm-svn: 324056
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-regcall-Mask.ll | 62 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll | 91 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 808 |
4 files changed, 488 insertions, 490 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 927cc35ba6e..a55770ae2f1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24953,6 +24953,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); + // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target + // we can split using the k-register rather than memory. + if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) { + assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v32i1, + N->getOperand(0), + DAG.getIntPtrConstant(0, dl)); + Lo = DAG.getBitcast(MVT::i32, Lo); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v32i1, + N->getOperand(0), + DAG.getIntPtrConstant(32, dl)); + Hi = DAG.getBitcast(MVT::i32, Hi); + SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + Results.push_back(Res); + return; + } + if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) return; diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll index b4aa3efd78e..199f1cbf03f 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -7,46 +7,30 @@ define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1> %x2, <64 x i1> %x3, <64 x i1> %x4, <64 x i1> %x5, <64 x i1> %x6, <64 x i1> %x7, <64 x i1> %x8, <64 x i1> %x9, <64 x i1> %x10, <64 x i1> %x11, <64 x i1> %x12) { ; X32-LABEL: test_argv64i1: ; X32: # %bb.0: -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $16, %esp -; X32-NEXT: kmovd %edx, %k0 -; X32-NEXT: kmovd %edi, %k1 -; X32-NEXT: kunpckdq %k0, %k1, %k0 -; X32-NEXT: kmovd %eax, %k1 -; X32-NEXT: kmovd %ecx, %k2 -; X32-NEXT: kunpckdq %k1, %k2, %k1 -; X32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: kmovq %k0, (%esp) -; X32-NEXT: addl (%esp), %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %eax ; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: addl 8(%ebp), %eax -; X32-NEXT: adcl 12(%ebp), %ecx -; X32-NEXT: addl 16(%ebp), %eax -; X32-NEXT: adcl 20(%ebp), %ecx -; X32-NEXT: addl 24(%ebp), %eax -; X32-NEXT: adcl 28(%ebp), %ecx -; X32-NEXT: addl 32(%ebp), %eax -; X32-NEXT: adcl 36(%ebp), %ecx -; X32-NEXT: addl 40(%ebp), %eax -; X32-NEXT: adcl 44(%ebp), %ecx -; X32-NEXT: addl 48(%ebp), %eax -; X32-NEXT: adcl 52(%ebp), %ecx -; X32-NEXT: addl 56(%ebp), %eax -; X32-NEXT: adcl 60(%ebp), %ecx -; X32-NEXT: addl 64(%ebp), %eax -; X32-NEXT: adcl 68(%ebp), %ecx -; X32-NEXT: addl 72(%ebp), %eax -; X32-NEXT: adcl 76(%ebp), %ecx -; X32-NEXT: addl 80(%ebp), %eax -; X32-NEXT: adcl 84(%ebp), %ecx -; X32-NEXT: addl 88(%ebp), %eax -; X32-NEXT: adcl 92(%ebp), %ecx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; WIN64-LABEL: test_argv64i1: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index 6ef64592884..1315fb474bb 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -17,11 +17,12 @@ define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, ; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 ; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 ; X32-NEXT: vpcmpneqb 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckdq %k0, %k1, %k1 -; X32-NEXT: vpcmpneqb 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: vpcmpneqb 72(%ebp), %zmm3, %k2 +; X32-NEXT: kandd %k0, %k2, %k0 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: kshiftrq $32, %k2, %k0 +; X32-NEXT: kandd %k1, %k0, %k0 +; X32-NEXT: kmovd %k0, %edx ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper @@ -1647,19 +1648,10 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { ; X32-LABEL: test_mm512_test_epi8_mask: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp ; X32-NEXT: vptestmb %zmm0, %zmm1, %k0 -; X32-NEXT: kmovq %k0, (%esp) -; X32-NEXT: movl (%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp +; X32-NEXT: kshiftrq $32, %k0, %k1 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: kmovd %k1, %edx ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -1680,18 +1672,13 @@ entry: define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { ; X32-LABEL: test_mm512_mask_test_epi8_mask: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp ; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k0 ; X32-NEXT: kshiftrq $1, %k0, %k1 ; X32-NEXT: movl %eax, %ecx @@ -1798,7 +1785,7 @@ define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> % ; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: shrb $2, %bl ; X32-NEXT: kmovd %ebx, %k7 -; X32-NEXT: movl 12(%ebp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $53, %k1, %k1 ; X32-NEXT: kxorq %k1, %k0, %k0 @@ -2211,13 +2198,11 @@ define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> % ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: korq %k1, %k0, %k1 ; X32-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1} -; X32-NEXT: kmovq %k0, (%esp) -; X32-NEXT: movl (%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: leal -8(%ebp), %esp +; X32-NEXT: kshiftrq $32, %k0, %k1 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: kmovd %k1, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebx -; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -2289,19 +2274,10 @@ entry: define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { ; X32-LABEL: test_mm512_testn_epi8_mask: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp ; X32-NEXT: vptestnmb %zmm0, %zmm1, %k0 -; X32-NEXT: kmovq %k0, (%esp) -; X32-NEXT: movl (%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp +; X32-NEXT: kshiftrq $32, %k0, %k1 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: kmovd %k1, %edx ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -2322,18 +2298,13 @@ entry: define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { ; X32-LABEL: test_mm512_mask_testn_epi8_mask: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp ; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k0 ; X32-NEXT: kshiftrq $1, %k0, %k1 ; X32-NEXT: movl %eax, %ecx @@ -2440,7 +2411,7 @@ define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> ; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: shrb $2, %bl ; X32-NEXT: kmovd %ebx, %k7 -; X32-NEXT: movl 12(%ebp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $53, %k1, %k1 ; X32-NEXT: kxorq %k1, %k0, %k0 @@ -2853,13 +2824,11 @@ define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: korq %k1, %k0, %k1 ; X32-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1} -; X32-NEXT: kmovq %k0, (%esp) -; X32-NEXT: movl (%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: leal -8(%ebp), %esp +; X32-NEXT: kshiftrq $32, %k0, %k1 +; X32-NEXT: kmovd %k0, %eax +; X32-NEXT: kmovd %k1, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebx -; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 1d0be23316e..8fad90b214b 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -37,15 +37,8 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { ; ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) ret i64 %res @@ -396,13 +389,10 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { ; ; AVX512F-32-LABEL: test_pcmpeq_b: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) @@ -420,14 +410,11 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; ; AVX512F-32-LABEL: test_mask_pcmpeq_b: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) @@ -486,13 +473,10 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { ; ; AVX512F-32-LABEL: test_pcmpgt_b: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) @@ -510,14 +494,11 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; ; AVX512F-32-LABEL: test_mask_pcmpgt_b: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) @@ -1719,37 +1700,52 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; ; AVX512F-32-LABEL: test_cmp_b_512: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 64 +; AVX512F-32-NEXT: pushl %edi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 +; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: .cfi_offset %edi, -8 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx ; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %ecx, %esi +; AVX512F-32-NEXT: adcl %eax, %edx ; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %esi, %ecx +; AVX512F-32-NEXT: adcl %edx, %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %ecx, %esi +; AVX512F-32-NEXT: adcl %eax, %edx ; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %ecx +; AVX512F-32-NEXT: kmovd %k0, %edi +; AVX512F-32-NEXT: addl %esi, %edi +; AVX512F-32-NEXT: adcl %edx, %ecx ; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: addl (%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: kxnord %k0, %k0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -1805,41 +1801,40 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 -; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 ; AVX512F-32-NEXT: .cfi_offset %esi, -20 ; AVX512F-32-NEXT: .cfi_offset %edi, -16 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 ; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $16, %eax -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %ebx, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: movl %edx, %ecx -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k2 -; AVX512F-32-NEXT: movb %bh, %dl +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrl $16, %ecx +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k2 +; AVX512F-32-NEXT: movb %ah, %dl ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: shrb $3, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrb $3, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $4, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k3 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $5, %cl ; AVX512F-32-NEXT: andb $1, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k4 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $6, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k6 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $7, %cl -; AVX512F-32-NEXT: kmovd %ebx, %k5 +; AVX512F-32-NEXT: kmovd %eax, %k5 ; AVX512F-32-NEXT: kshiftrq $1, %k5, %k7 ; AVX512F-32-NEXT: kxorq %k1, %k7, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 @@ -1848,9 +1843,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $2, %k7, %k1 ; AVX512F-32-NEXT: kxorq %k2, %k1, %k2 ; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movb %bh, %cl +; AVX512F-32-NEXT: movb %ah, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: movl %ebx, %ebp +; AVX512F-32-NEXT: movl %eax, %ebp ; AVX512F-32-NEXT: andb $2, %cl ; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2 @@ -1867,6 +1862,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $4, %k0, %k7 ; AVX512F-32-NEXT: kxorq %k3, %k7, %k7 ; AVX512F-32-NEXT: kmovd %edx, %k3 +; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: shrb $3, %cl @@ -1876,7 +1872,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $5, %k7, %k0 ; AVX512F-32-NEXT: kxorq %k4, %k0, %k4 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: shrl $13, %ecx ; AVX512F-32-NEXT: andb $1, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 @@ -1937,8 +1933,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k0, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $6, %dl +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $6, %cl ; AVX512F-32-NEXT: shrl $15, %esi ; AVX512F-32-NEXT: shrl $14, %edi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -1964,9 +1960,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $16, %k3, %k4 ; AVX512F-32-NEXT: kmovd %eax, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k4, %k4 -; AVX512F-32-NEXT: kmovd %edx, %k7 -; AVX512F-32-NEXT: movl %ebp, %edx -; AVX512F-32-NEXT: shrl $24, %edx +; AVX512F-32-NEXT: kmovd %ecx, %k7 +; AVX512F-32-NEXT: movl %ebp, %ecx +; AVX512F-32-NEXT: shrl $24, %ecx ; AVX512F-32-NEXT: # kill: def $al killed $al killed $eax def $eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 @@ -1975,15 +1971,15 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $17, %k3, %k4 ; AVX512F-32-NEXT: kxorq %k5, %k4, %k4 ; AVX512F-32-NEXT: kmovd %eax, %k5 -; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 ; AVX512F-32-NEXT: kshiftrq $46, %k4, %k4 ; AVX512F-32-NEXT: kxorq %k4, %k3, %k4 ; AVX512F-32-NEXT: kshiftrq $18, %k4, %k3 ; AVX512F-32-NEXT: kxorq %k6, %k3, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k3 -; AVX512F-32-NEXT: # kill: def $dl killed $dl killed $edx def $edx -; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: kmovd %ecx, %k3 +; AVX512F-32-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx +; AVX512F-32-NEXT: andb $15, %cl ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 @@ -1992,23 +1988,23 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $19, %k6, %k4 ; AVX512F-32-NEXT: kxorq %k1, %k4, %k1 ; AVX512F-32-NEXT: kmovd %eax, %k4 -; AVX512F-32-NEXT: movl %edx, %ecx -; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: shrb $2, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: kshiftrq $44, %k1, %k1 ; AVX512F-32-NEXT: kxorq %k1, %k6, %k1 ; AVX512F-32-NEXT: kshiftrq $20, %k1, %k6 ; AVX512F-32-NEXT: kxorq %k2, %k6, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k2 +; AVX512F-32-NEXT: kmovd %ecx, %k2 ; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: shrb $3, %cl +; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $43, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k1, %k6 ; AVX512F-32-NEXT: kshiftrq $21, %k6, %k1 ; AVX512F-32-NEXT: kxorq %k0, %k1, %k0 -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: shrl $29, %ecx ; AVX512F-32-NEXT: andb $1, %cl @@ -2018,15 +2014,15 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $22, %k6, %k0 ; AVX512F-32-NEXT: kxorq %k7, %k0, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $41, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $23, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k5, %k7, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k5 +; AVX512F-32-NEXT: kmovd %ecx, %k5 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2069,10 +2065,10 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k4, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k4 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: shrb $6, %dl -; AVX512F-32-NEXT: movl %ebp, %ecx -; AVX512F-32-NEXT: shrl $31, %ecx +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrb $6, %cl +; AVX512F-32-NEXT: movl %ebp, %edx +; AVX512F-32-NEXT: shrl $31, %edx ; AVX512F-32-NEXT: movl %ebp, %esi ; AVX512F-32-NEXT: shrl $30, %esi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2090,7 +2086,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $33, %k1, %k1 ; AVX512F-32-NEXT: kxorq %k1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $31, %k0, %k1 -; AVX512F-32-NEXT: kmovd %ecx, %k7 +; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k1, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: kshiftrq $32, %k1, %k1 @@ -2098,7 +2094,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 ; AVX512F-32-NEXT: kmovd %ebx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k1, %k1 -; AVX512F-32-NEXT: kmovd %edx, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k7 ; AVX512F-32-NEXT: movl %ebx, %ecx ; AVX512F-32-NEXT: shrb $7, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 @@ -2137,119 +2133,117 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $27, %k2, %k2 ; AVX512F-32-NEXT: kxorq %k2, %k5, %k2 ; AVX512F-32-NEXT: kshiftrq $37, %k2, %k5 -; AVX512F-32-NEXT: kxorq %k4, %k5, %k5 -; AVX512F-32-NEXT: kmovd %ecx, %k4 +; AVX512F-32-NEXT: kxorq %k4, %k5, %k4 +; AVX512F-32-NEXT: kmovd %ecx, %k5 ; AVX512F-32-NEXT: movl %ebx, %ecx ; AVX512F-32-NEXT: shrl $13, %ecx ; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $26, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k2, %k2 -; AVX512F-32-NEXT: kshiftrq $38, %k2, %k5 -; AVX512F-32-NEXT: kxorq %k7, %k5, %k7 -; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 +; AVX512F-32-NEXT: kshiftrq $26, %k4, %k4 +; AVX512F-32-NEXT: kxorq %k4, %k2, %k2 +; AVX512F-32-NEXT: kshiftrq $38, %k2, %k4 +; AVX512F-32-NEXT: kxorq %k7, %k4, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k4 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $25, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k2, %k7 ; AVX512F-32-NEXT: kshiftrq $39, %k7, %k2 ; AVX512F-32-NEXT: kxorq %k6, %k2, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k2 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %ecx +; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $24, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k7, %k6 ; AVX512F-32-NEXT: kshiftrq $40, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k1, %k7, %k7 -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) # 8-byte Spill -; AVX512F-32-NEXT: movzwl %bx, %ecx -; AVX512F-32-NEXT: movl %ecx, %esi -; AVX512F-32-NEXT: movl %ecx, %edi -; AVX512F-32-NEXT: shrl $12, %ecx +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movzwl %bx, %esi +; AVX512F-32-NEXT: movl %esi, %edx +; AVX512F-32-NEXT: movl %esi, %edi +; AVX512F-32-NEXT: shrl $12, %esi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $23, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $41, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k0, %k7, %k0 -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: kmovd %esi, %k7 ; AVX512F-32-NEXT: shrl $14, %edi ; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $22, %k0, %k0 ; AVX512F-32-NEXT: kxorq %k0, %k6, %k0 ; AVX512F-32-NEXT: kshiftrq $42, %k0, %k6 ; AVX512F-32-NEXT: kxorq %k3, %k6, %k3 -; AVX512F-32-NEXT: kmovd %edi, %k7 -; AVX512F-32-NEXT: shrl $15, %esi +; AVX512F-32-NEXT: kmovd %edi, %k6 +; AVX512F-32-NEXT: shrb $3, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3 ; AVX512F-32-NEXT: kshiftrq $21, %k3, %k3 -; AVX512F-32-NEXT: kxorq %k3, %k0, %k0 -; AVX512F-32-NEXT: kshiftrq $43, %k0, %k3 -; AVX512F-32-NEXT: kxorq %k4, %k3, %k3 -; AVX512F-32-NEXT: kmovd %esi, %k6 -; AVX512F-32-NEXT: shrb $3, %dl -; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3 -; AVX512F-32-NEXT: kshiftrq $20, %k3, %k3 ; AVX512F-32-NEXT: kxorq %k3, %k0, %k3 -; AVX512F-32-NEXT: kshiftrq $44, %k3, %k0 -; AVX512F-32-NEXT: kxorq %k1, %k0, %k1 -; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: kshiftrq $43, %k3, %k0 +; AVX512F-32-NEXT: kxorq %k5, %k0, %k5 +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $4, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $19, %k1, %k1 -; AVX512F-32-NEXT: kxorq %k1, %k3, %k1 -; AVX512F-32-NEXT: kshiftrq $45, %k1, %k3 -; AVX512F-32-NEXT: kxorq %k5, %k3, %k4 +; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $20, %k5, %k5 +; AVX512F-32-NEXT: kxorq %k5, %k3, %k5 +; AVX512F-32-NEXT: kshiftrq $44, %k5, %k3 +; AVX512F-32-NEXT: kxorq %k7, %k3, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k3 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $5, %cl ; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 -; AVX512F-32-NEXT: kshiftrq $18, %k4, %k4 -; AVX512F-32-NEXT: kxorq %k4, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $46, %k1, %k4 -; AVX512F-32-NEXT: kxorq %k7, %k4, %k5 +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $19, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $45, %k5, %k7 +; AVX512F-32-NEXT: kxorq %k4, %k7, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k4 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $6, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $17, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $47, %k1, %k5 +; AVX512F-32-NEXT: shrl $15, %edx +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $18, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $46, %k5, %k7 +; AVX512F-32-NEXT: kxorq %k6, %k7, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $17, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k5, %k5 -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $16, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $48, %k1, %k5 -; AVX512F-32-NEXT: kmovd %eax, %k6 +; AVX512F-32-NEXT: kshiftrq $47, %k5, %k6 +; AVX512F-32-NEXT: kmovd %edx, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $16, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k5, %k6 +; AVX512F-32-NEXT: kshiftrq $48, %k6, %k5 +; AVX512F-32-NEXT: kmovd %eax, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: shrl $24, %edx +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrl $24, %ecx ; AVX512F-32-NEXT: # kill: def $al killed $al killed $eax def $eax ; AVX512F-32-NEXT: shrb $7, %al -; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 -; AVX512F-32-NEXT: kshiftrq $15, %k6, %k6 -; AVX512F-32-NEXT: kxorq %k6, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $49, %k1, %k6 -; AVX512F-32-NEXT: kxorq %k2, %k6, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $15, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $49, %k6, %k7 +; AVX512F-32-NEXT: kxorq %k2, %k7, %k7 ; AVX512F-32-NEXT: kmovd %eax, %k2 -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 -; AVX512F-32-NEXT: kshiftrq $14, %k6, %k6 -; AVX512F-32-NEXT: kxorq %k6, %k1, %k6 -; AVX512F-32-NEXT: kshiftrq $50, %k6, %k1 -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload -; AVX512F-32-NEXT: kxorq %k7, %k1, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: # kill: def $dl killed $dl killed $edx def $edx -; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $14, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $50, %k6, %k7 +; AVX512F-32-NEXT: kxorq %k1, %k7, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx +; AVX512F-32-NEXT: andb $15, %cl ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2258,14 +2252,14 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftrq $51, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k0, %k7, %k7 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $12, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $52, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k3, %k7, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k3 +; AVX512F-32-NEXT: kmovd %ecx, %k3 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $11, %k7, %k7 @@ -2339,32 +2333,41 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k2, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx ; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: adcl %eax, %ecx ; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: kmovd %k2, %edx +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: kmovd %k2, %eax +; AVX512F-32-NEXT: adcl %edx, %eax ; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: adcl %eax, %ecx ; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl %ebp, %eax ; AVX512F-32-NEXT: adcl %ebx, %edx -; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx @@ -2419,37 +2422,52 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; ; AVX512F-32-LABEL: test_ucmp_b_512: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 64 +; AVX512F-32-NEXT: pushl %edi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 +; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: .cfi_offset %edi, -8 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx ; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %ecx, %esi +; AVX512F-32-NEXT: adcl %eax, %edx ; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %esi, %ecx +; AVX512F-32-NEXT: adcl %edx, %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %ecx, %esi +; AVX512F-32-NEXT: adcl %eax, %edx ; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %ecx +; AVX512F-32-NEXT: kmovd %k0, %edi +; AVX512F-32-NEXT: addl %esi, %edi +; AVX512F-32-NEXT: adcl %edx, %ecx ; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: addl (%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: kxnord %k0, %k0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -2505,41 +2523,40 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 -; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 ; AVX512F-32-NEXT: .cfi_offset %esi, -20 ; AVX512F-32-NEXT: .cfi_offset %edi, -16 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 ; AVX512F-32-NEXT: .cfi_offset %ebp, -8 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrl $16, %eax -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %ebx, %ecx -; AVX512F-32-NEXT: andb $2, %cl -; AVX512F-32-NEXT: shrb %cl -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: movl %edx, %ecx -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k2 -; AVX512F-32-NEXT: movb %bh, %dl +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrl $16, %ecx +; AVX512F-32-NEXT: movl %ecx, %esi +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %ebx +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k2 +; AVX512F-32-NEXT: movb %ah, %dl ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: shrb $3, %cl -; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrb $3, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $4, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k3 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $5, %cl ; AVX512F-32-NEXT: andb $1, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k4 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $6, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k6 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $7, %cl -; AVX512F-32-NEXT: kmovd %ebx, %k5 +; AVX512F-32-NEXT: kmovd %eax, %k5 ; AVX512F-32-NEXT: kshiftrq $1, %k5, %k7 ; AVX512F-32-NEXT: kxorq %k1, %k7, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 @@ -2548,9 +2565,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $2, %k7, %k1 ; AVX512F-32-NEXT: kxorq %k2, %k1, %k2 ; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movb %bh, %cl +; AVX512F-32-NEXT: movb %ah, %cl ; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: movl %ebx, %ebp +; AVX512F-32-NEXT: movl %eax, %ebp ; AVX512F-32-NEXT: andb $2, %cl ; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2 @@ -2567,6 +2584,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $4, %k0, %k7 ; AVX512F-32-NEXT: kxorq %k3, %k7, %k7 ; AVX512F-32-NEXT: kmovd %edx, %k3 +; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: shrb $3, %cl @@ -2576,7 +2594,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $5, %k7, %k0 ; AVX512F-32-NEXT: kxorq %k4, %k0, %k4 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: shrl $13, %ecx ; AVX512F-32-NEXT: andb $1, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 @@ -2637,8 +2655,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k0, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $6, %dl +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $6, %cl ; AVX512F-32-NEXT: shrl $15, %esi ; AVX512F-32-NEXT: shrl $14, %edi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2664,9 +2682,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $16, %k3, %k4 ; AVX512F-32-NEXT: kmovd %eax, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k4, %k4 -; AVX512F-32-NEXT: kmovd %edx, %k7 -; AVX512F-32-NEXT: movl %ebp, %edx -; AVX512F-32-NEXT: shrl $24, %edx +; AVX512F-32-NEXT: kmovd %ecx, %k7 +; AVX512F-32-NEXT: movl %ebp, %ecx +; AVX512F-32-NEXT: shrl $24, %ecx ; AVX512F-32-NEXT: # kill: def $al killed $al killed $eax def $eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 @@ -2675,15 +2693,15 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $17, %k3, %k4 ; AVX512F-32-NEXT: kxorq %k5, %k4, %k4 ; AVX512F-32-NEXT: kmovd %eax, %k5 -; AVX512F-32-NEXT: movl %edx, %eax +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 ; AVX512F-32-NEXT: kshiftrq $46, %k4, %k4 ; AVX512F-32-NEXT: kxorq %k4, %k3, %k4 ; AVX512F-32-NEXT: kshiftrq $18, %k4, %k3 ; AVX512F-32-NEXT: kxorq %k6, %k3, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k3 -; AVX512F-32-NEXT: # kill: def $dl killed $dl killed $edx def $edx -; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: kmovd %ecx, %k3 +; AVX512F-32-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx +; AVX512F-32-NEXT: andb $15, %cl ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 @@ -2692,23 +2710,23 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $19, %k6, %k4 ; AVX512F-32-NEXT: kxorq %k1, %k4, %k1 ; AVX512F-32-NEXT: kmovd %eax, %k4 -; AVX512F-32-NEXT: movl %edx, %ecx -; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: shrb $2, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: kshiftrq $44, %k1, %k1 ; AVX512F-32-NEXT: kxorq %k1, %k6, %k1 ; AVX512F-32-NEXT: kshiftrq $20, %k1, %k6 ; AVX512F-32-NEXT: kxorq %k2, %k6, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k2 +; AVX512F-32-NEXT: kmovd %ecx, %k2 ; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: shrb $3, %cl +; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $43, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k1, %k6 ; AVX512F-32-NEXT: kshiftrq $21, %k6, %k1 ; AVX512F-32-NEXT: kxorq %k0, %k1, %k0 -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: shrl $29, %ecx ; AVX512F-32-NEXT: andb $1, %cl @@ -2718,15 +2736,15 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $22, %k6, %k0 ; AVX512F-32-NEXT: kxorq %k7, %k0, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k0 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $41, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $23, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k5, %k7, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k5 +; AVX512F-32-NEXT: kmovd %ecx, %k5 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2769,10 +2787,10 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k4, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k4 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: shrb $6, %dl -; AVX512F-32-NEXT: movl %ebp, %ecx -; AVX512F-32-NEXT: shrl $31, %ecx +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrb $6, %cl +; AVX512F-32-NEXT: movl %ebp, %edx +; AVX512F-32-NEXT: shrl $31, %edx ; AVX512F-32-NEXT: movl %ebp, %esi ; AVX512F-32-NEXT: shrl $30, %esi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2790,7 +2808,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $33, %k1, %k1 ; AVX512F-32-NEXT: kxorq %k1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $31, %k0, %k1 -; AVX512F-32-NEXT: kmovd %ecx, %k7 +; AVX512F-32-NEXT: kmovd %edx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k1, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: kshiftrq $32, %k1, %k1 @@ -2798,7 +2816,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 ; AVX512F-32-NEXT: kmovd %ebx, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k1, %k1 -; AVX512F-32-NEXT: kmovd %edx, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k7 ; AVX512F-32-NEXT: movl %ebx, %ecx ; AVX512F-32-NEXT: shrb $7, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 @@ -2837,119 +2855,117 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $27, %k2, %k2 ; AVX512F-32-NEXT: kxorq %k2, %k5, %k2 ; AVX512F-32-NEXT: kshiftrq $37, %k2, %k5 -; AVX512F-32-NEXT: kxorq %k4, %k5, %k5 -; AVX512F-32-NEXT: kmovd %ecx, %k4 +; AVX512F-32-NEXT: kxorq %k4, %k5, %k4 +; AVX512F-32-NEXT: kmovd %ecx, %k5 ; AVX512F-32-NEXT: movl %ebx, %ecx ; AVX512F-32-NEXT: shrl $13, %ecx ; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $26, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k2, %k2 -; AVX512F-32-NEXT: kshiftrq $38, %k2, %k5 -; AVX512F-32-NEXT: kxorq %k7, %k5, %k7 -; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 +; AVX512F-32-NEXT: kshiftrq $26, %k4, %k4 +; AVX512F-32-NEXT: kxorq %k4, %k2, %k2 +; AVX512F-32-NEXT: kshiftrq $38, %k2, %k4 +; AVX512F-32-NEXT: kxorq %k7, %k4, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k4 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $25, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k2, %k7 ; AVX512F-32-NEXT: kshiftrq $39, %k7, %k2 ; AVX512F-32-NEXT: kxorq %k6, %k2, %k6 -; AVX512F-32-NEXT: kmovd %edx, %k2 -; AVX512F-32-NEXT: movl %eax, %ecx -; AVX512F-32-NEXT: andb $15, %cl -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k2 +; AVX512F-32-NEXT: movl %eax, %edx +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %edx, %ecx +; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $24, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k7, %k6 ; AVX512F-32-NEXT: kshiftrq $40, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k1, %k7, %k7 -; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) # 8-byte Spill -; AVX512F-32-NEXT: movzwl %bx, %ecx -; AVX512F-32-NEXT: movl %ecx, %esi -; AVX512F-32-NEXT: movl %ecx, %edi -; AVX512F-32-NEXT: shrl $12, %ecx +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movzwl %bx, %esi +; AVX512F-32-NEXT: movl %esi, %edx +; AVX512F-32-NEXT: movl %esi, %edi +; AVX512F-32-NEXT: shrl $12, %esi ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $23, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $41, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k0, %k7, %k0 -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: kmovd %esi, %k7 ; AVX512F-32-NEXT: shrl $14, %edi ; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $22, %k0, %k0 ; AVX512F-32-NEXT: kxorq %k0, %k6, %k0 ; AVX512F-32-NEXT: kshiftrq $42, %k0, %k6 ; AVX512F-32-NEXT: kxorq %k3, %k6, %k3 -; AVX512F-32-NEXT: kmovd %edi, %k7 -; AVX512F-32-NEXT: shrl $15, %esi +; AVX512F-32-NEXT: kmovd %edi, %k6 +; AVX512F-32-NEXT: shrb $3, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3 ; AVX512F-32-NEXT: kshiftrq $21, %k3, %k3 -; AVX512F-32-NEXT: kxorq %k3, %k0, %k0 -; AVX512F-32-NEXT: kshiftrq $43, %k0, %k3 -; AVX512F-32-NEXT: kxorq %k4, %k3, %k3 -; AVX512F-32-NEXT: kmovd %esi, %k6 -; AVX512F-32-NEXT: shrb $3, %dl -; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3 -; AVX512F-32-NEXT: kshiftrq $20, %k3, %k3 ; AVX512F-32-NEXT: kxorq %k3, %k0, %k3 -; AVX512F-32-NEXT: kshiftrq $44, %k3, %k0 -; AVX512F-32-NEXT: kxorq %k1, %k0, %k1 -; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: kshiftrq $43, %k3, %k0 +; AVX512F-32-NEXT: kxorq %k5, %k0, %k5 +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $4, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $19, %k1, %k1 -; AVX512F-32-NEXT: kxorq %k1, %k3, %k1 -; AVX512F-32-NEXT: kshiftrq $45, %k1, %k3 -; AVX512F-32-NEXT: kxorq %k5, %k3, %k4 +; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $20, %k5, %k5 +; AVX512F-32-NEXT: kxorq %k5, %k3, %k5 +; AVX512F-32-NEXT: kshiftrq $44, %k5, %k3 +; AVX512F-32-NEXT: kxorq %k7, %k3, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k3 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $5, %cl ; AVX512F-32-NEXT: andb $1, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4 -; AVX512F-32-NEXT: kshiftrq $18, %k4, %k4 -; AVX512F-32-NEXT: kxorq %k4, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $46, %k1, %k4 -; AVX512F-32-NEXT: kxorq %k7, %k4, %k5 +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $19, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $45, %k5, %k7 +; AVX512F-32-NEXT: kxorq %k4, %k7, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k4 ; AVX512F-32-NEXT: movl %eax, %ecx ; AVX512F-32-NEXT: shrb $6, %cl -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $17, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $47, %k1, %k5 +; AVX512F-32-NEXT: shrl $15, %edx +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $18, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k5 +; AVX512F-32-NEXT: kshiftrq $46, %k5, %k7 +; AVX512F-32-NEXT: kxorq %k6, %k7, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $17, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k5, %k5 -; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5 -; AVX512F-32-NEXT: kshiftrq $16, %k5, %k5 -; AVX512F-32-NEXT: kxorq %k5, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $48, %k1, %k5 -; AVX512F-32-NEXT: kmovd %eax, %k6 +; AVX512F-32-NEXT: kshiftrq $47, %k5, %k6 +; AVX512F-32-NEXT: kmovd %edx, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $16, %k6, %k6 ; AVX512F-32-NEXT: kxorq %k6, %k5, %k6 +; AVX512F-32-NEXT: kshiftrq $48, %k6, %k5 +; AVX512F-32-NEXT: kmovd %eax, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k5, %k7 ; AVX512F-32-NEXT: kmovd %ecx, %k5 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: shrl $24, %edx +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: shrl $24, %ecx ; AVX512F-32-NEXT: # kill: def $al killed $al killed $eax def $eax ; AVX512F-32-NEXT: shrb $7, %al -; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 -; AVX512F-32-NEXT: kshiftrq $15, %k6, %k6 -; AVX512F-32-NEXT: kxorq %k6, %k1, %k1 -; AVX512F-32-NEXT: kshiftrq $49, %k1, %k6 -; AVX512F-32-NEXT: kxorq %k2, %k6, %k6 +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $15, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $49, %k6, %k7 +; AVX512F-32-NEXT: kxorq %k2, %k7, %k7 ; AVX512F-32-NEXT: kmovd %eax, %k2 -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6 -; AVX512F-32-NEXT: kshiftrq $14, %k6, %k6 -; AVX512F-32-NEXT: kxorq %k6, %k1, %k6 -; AVX512F-32-NEXT: kshiftrq $50, %k6, %k1 -; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload -; AVX512F-32-NEXT: kxorq %k7, %k1, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: # kill: def $dl killed $dl killed $edx def $edx -; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 +; AVX512F-32-NEXT: kshiftrq $14, %k7, %k7 +; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 +; AVX512F-32-NEXT: kshiftrq $50, %k6, %k7 +; AVX512F-32-NEXT: kxorq %k1, %k7, %k7 +; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx +; AVX512F-32-NEXT: andb $15, %cl ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 @@ -2958,14 +2974,14 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftrq $51, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k0, %k7, %k7 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $12, %k7, %k7 ; AVX512F-32-NEXT: kxorq %k7, %k6, %k6 ; AVX512F-32-NEXT: kshiftrq $52, %k6, %k7 ; AVX512F-32-NEXT: kxorq %k3, %k7, %k7 -; AVX512F-32-NEXT: kmovd %edx, %k3 +; AVX512F-32-NEXT: kmovd %ecx, %k3 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7 ; AVX512F-32-NEXT: kshiftrq $11, %k7, %k7 @@ -3039,32 +3055,41 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k2, %eax +; AVX512F-32-NEXT: kmovd %k0, %ecx ; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: adcl %eax, %ecx ; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: kmovd %k2, %edx +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: kmovd %k2, %eax +; AVX512F-32-NEXT: adcl %edx, %eax ; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k2 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: adcl %eax, %ecx ; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl %ebp, %eax ; AVX512F-32-NEXT: adcl %ebx, %edx -; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx @@ -3487,18 +3512,21 @@ define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x ; ; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $20, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %esi, -8 ; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k1 {%k1} -; AVX512F-32-NEXT: kmovq %k1, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k1, %k2 +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: kmovd %k1, %esi +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %esi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) @@ -3553,18 +3581,21 @@ define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 % ; ; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $20, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %esi, -8 ; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k1 {%k1} -; AVX512F-32-NEXT: kmovq %k1, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k1, %k2 +; AVX512F-32-NEXT: kmovd %k2, %ecx +; AVX512F-32-NEXT: kmovd %k1, %esi +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k1, %edx +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %esi, %eax +; AVX512F-32-NEXT: adcl %ecx, %edx +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) @@ -3615,13 +3646,10 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { ; ; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 ; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: kshiftrq $32, %k0, %k1 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: kmovd %k1, %edx ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0) |