diff options
author | Craig Topper <craig.topper@intel.com> | 2019-12-10 14:44:38 -0800 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2019-12-10 15:07:55 -0800 |
commit | 88dacbd43625cf7aad8a01c0c3b92142c4dc0970 (patch) | |
tree | 2eec242db6fc73217ba6d66511625a39d7283113 | |
parent | 1d41d1bcdfd70cf8f77bb32e2617392395c299a4 (diff) | |
download | bcm5719-llvm-88dacbd43625cf7aad8a01c0c3b92142c4dc0970.tar.gz bcm5719-llvm-88dacbd43625cf7aad8a01c0c3b92142c4dc0970.zip |
[X86] Go back to considering v64i1 as a legal type under min-legal-vector-width=256. Scalarize v64i1 arguments and shuffles under min-legal-vector-width=256.
This reverts 3e1aee2ba717529b651a79ed4fc7e7147358043f in favor
of a different approach.
Scalarizing isn't great codegen, but making the type illegal was
interfering with the k constraint in inline assembly.
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 79 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/min-legal-vector-width.ll | 918 |
2 files changed, 937 insertions, 60 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7733ad66162..703a3af1918 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1638,32 +1638,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - // This block control legalization of v32i1 which is available with + // This block control legalization of v32i1/v64i1 which are available with // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with - // useBWIRegs. v64i1 is also controled with useBWIRegs. + // useBWIRegs. if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::ADD, MVT::v32i1, Custom); - setOperationAction(ISD::SUB, MVT::v32i1, Custom); - setOperationAction(ISD::MUL, MVT::v32i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i1, Expand); - setOperationAction(ISD::UADDSAT, MVT::v32i1, Custom); - setOperationAction(ISD::SADDSAT, MVT::v32i1, Custom); - setOperationAction(ISD::USUBSAT, MVT::v32i1, Custom); - setOperationAction(ISD::SSUBSAT, MVT::v32i1, Custom); - - setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); - setOperationAction(ISD::SETCC, MVT::v32i1, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); - setOperationAction(ISD::SELECT, MVT::v32i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + for (auto VT : { MVT::v32i1, MVT::v64i1 }) { + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, 
Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + for (auto VT : { MVT::v16i1, MVT::v32i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); @@ -1753,34 +1759,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); } - - // Only support v64i1 if we support v64i8. Without 64i8 we won't have any - // operations that can produce these values other than concatenating - // v32i1 vectors together. And we don't have any masked operations that - // need a v64i1. By making it legal we avoid needing to lower arbitrary - // shuffles of v64i1 which need v64i8 to be legal. 
- addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - - setOperationAction(ISD::ADD, MVT::v64i1, Custom); - setOperationAction(ISD::SUB, MVT::v64i1, Custom); - setOperationAction(ISD::MUL, MVT::v64i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); - setOperationAction(ISD::UADDSAT, MVT::v64i1, Custom); - setOperationAction(ISD::SADDSAT, MVT::v64i1, Custom); - setOperationAction(ISD::USUBSAT, MVT::v64i1, Custom); - setOperationAction(ISD::SSUBSAT, MVT::v64i1, Custom); - - setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); - setOperationAction(ISD::SETCC, MVT::v64i1, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); - setOperationAction(ISD::SELECT, MVT::v64i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i1, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { @@ -2020,6 +1998,7 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return MVT::i8; // FIXME: Should we just make these types legal and custom split operations? 
@@ -2040,6 +2019,7 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return VT.getVectorNumElements(); // FIXME: Should we just make these types legal and custom split operations? @@ -2057,6 +2037,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; @@ -17041,6 +17022,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; break; case MVT::v64i1: + // Fall back to scalarization. FIXME: We can do better if the shuffle + // can be partitioned cleanly. 
+ if (!Subtarget.useBWIRegs()) + return SDValue(); ExtVT = MVT::v64i8; break; } diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 6e256c060d2..bf48a305a2b 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI ; Make sure CPUs default to prefer-256-bit. 
avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 @@ -1120,6 +1120,448 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" { ; CHECK-LABEL: v64i1_argument_return: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: kmovd %esi, %k0 +; CHECK-NEXT: kshiftlq $63, %k0, %k0 +; CHECK-NEXT: kshiftrq $63, %k0, %k0 +; CHECK-NEXT: kshiftlq $2, %k0, %k1 +; CHECK-NEXT: kmovd %edx, %k2 +; CHECK-NEXT: kshiftlq $1, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $62, %k0, %k0 +; CHECK-NEXT: kshiftrq $62, %k0, %k0 +; CHECK-NEXT: kshiftlq $3, %k0, %k1 +; CHECK-NEXT: kmovd %ecx, %k2 +; CHECK-NEXT: kshiftlq $2, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $61, %k0, %k0 +; CHECK-NEXT: kshiftrq $61, %k0, %k0 +; CHECK-NEXT: kshiftlq $4, %k0, %k1 +; CHECK-NEXT: kmovd %r8d, %k2 +; CHECK-NEXT: kshiftlq $3, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $60, %k0, %k0 +; CHECK-NEXT: kshiftrq $60, %k0, %k0 +; CHECK-NEXT: kshiftlq $5, %k0, %k1 +; CHECK-NEXT: kmovd %r9d, %k2 +; CHECK-NEXT: kshiftlq $4, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $59, %k0, %k0 +; CHECK-NEXT: kshiftrq $59, %k0, %k0 +; CHECK-NEXT: kshiftlq $6, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $5, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $58, %k0, %k0 +; CHECK-NEXT: kshiftrq $58, %k0, %k0 +; CHECK-NEXT: 
kshiftlq $7, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $6, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $57, %k0, %k0 +; CHECK-NEXT: kshiftrq $57, %k0, %k0 +; CHECK-NEXT: kshiftlq $8, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $7, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $56, %k0, %k0 +; CHECK-NEXT: kshiftrq $56, %k0, %k0 +; CHECK-NEXT: kshiftlq $9, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $8, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $55, %k0, %k0 +; CHECK-NEXT: kshiftrq $55, %k0, %k0 +; CHECK-NEXT: kshiftlq $10, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $9, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $54, %k0, %k0 +; CHECK-NEXT: kshiftrq $54, %k0, %k0 +; CHECK-NEXT: kshiftlq $11, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $10, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $53, %k0, %k0 +; CHECK-NEXT: kshiftrq $53, %k0, %k0 +; CHECK-NEXT: kshiftlq $12, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $11, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $52, %k0, %k0 +; CHECK-NEXT: kshiftrq $52, %k0, %k0 +; CHECK-NEXT: kshiftlq $13, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $12, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $51, %k0, %k0 +; CHECK-NEXT: kshiftrq $51, %k0, %k0 +; CHECK-NEXT: kshiftlq $14, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $13, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, 
%k0, %k0 +; CHECK-NEXT: kshiftlq $50, %k0, %k0 +; CHECK-NEXT: kshiftrq $50, %k0, %k0 +; CHECK-NEXT: kshiftlq $15, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $14, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $49, %k0, %k0 +; CHECK-NEXT: kshiftrq $49, %k0, %k0 +; CHECK-NEXT: kshiftlq $16, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $15, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $48, %k0, %k0 +; CHECK-NEXT: kshiftrq $48, %k0, %k0 +; CHECK-NEXT: kshiftlq $17, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $16, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $47, %k0, %k0 +; CHECK-NEXT: kshiftrq $47, %k0, %k0 +; CHECK-NEXT: kshiftlq $18, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $17, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $46, %k0, %k0 +; CHECK-NEXT: kshiftrq $46, %k0, %k0 +; CHECK-NEXT: kshiftlq $19, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $18, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $45, %k0, %k0 +; CHECK-NEXT: kshiftrq $45, %k0, %k0 +; CHECK-NEXT: kshiftlq $20, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $19, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $44, %k0, %k0 +; CHECK-NEXT: kshiftrq $44, %k0, %k0 +; CHECK-NEXT: kshiftlq $21, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $20, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $43, %k0, %k0 +; CHECK-NEXT: kshiftrq $43, %k0, %k0 +; CHECK-NEXT: kshiftlq $22, %k0, %k1 +; CHECK-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $21, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $42, %k0, %k0 +; CHECK-NEXT: kshiftrq $42, %k0, %k0 +; CHECK-NEXT: kshiftlq $23, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $22, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $41, %k0, %k0 +; CHECK-NEXT: kshiftrq $41, %k0, %k0 +; CHECK-NEXT: kshiftlq $24, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $23, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $40, %k0, %k0 +; CHECK-NEXT: kshiftrq $40, %k0, %k0 +; CHECK-NEXT: kshiftlq $25, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $24, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $39, %k0, %k0 +; CHECK-NEXT: kshiftrq $39, %k0, %k0 +; CHECK-NEXT: kshiftlq $26, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $25, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $38, %k0, %k0 +; CHECK-NEXT: kshiftrq $38, %k0, %k0 +; CHECK-NEXT: kshiftlq $27, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $26, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $37, %k0, %k0 +; CHECK-NEXT: kshiftrq $37, %k0, %k0 +; CHECK-NEXT: kshiftlq $28, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $27, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $36, %k0, %k0 +; CHECK-NEXT: kshiftrq $36, %k0, %k0 +; CHECK-NEXT: kshiftlq $29, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $28, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $35, 
%k0, %k0 +; CHECK-NEXT: kshiftrq $35, %k0, %k0 +; CHECK-NEXT: kshiftlq $30, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $29, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $34, %k0, %k0 +; CHECK-NEXT: kshiftrq $34, %k0, %k0 +; CHECK-NEXT: kshiftlq $31, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $30, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $33, %k0, %k0 +; CHECK-NEXT: kshiftrq $33, %k0, %k0 +; CHECK-NEXT: kshiftlq $32, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $31, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $32, %k0, %k0 +; CHECK-NEXT: kshiftrq $32, %k0, %k0 +; CHECK-NEXT: kshiftlq $33, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $32, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $31, %k0, %k0 +; CHECK-NEXT: kshiftrq $31, %k0, %k0 +; CHECK-NEXT: kshiftlq $34, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $33, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $30, %k0, %k0 +; CHECK-NEXT: kshiftrq $30, %k0, %k0 +; CHECK-NEXT: kshiftlq $35, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $34, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $29, %k0, %k0 +; CHECK-NEXT: kshiftrq $29, %k0, %k0 +; CHECK-NEXT: kshiftlq $36, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $35, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $28, %k0, %k0 +; CHECK-NEXT: kshiftrq $28, %k0, %k0 +; CHECK-NEXT: kshiftlq $37, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $36, 
%k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $27, %k0, %k0 +; CHECK-NEXT: kshiftrq $27, %k0, %k0 +; CHECK-NEXT: kshiftlq $38, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $37, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $26, %k0, %k0 +; CHECK-NEXT: kshiftrq $26, %k0, %k0 +; CHECK-NEXT: kshiftlq $39, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $38, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $25, %k0, %k0 +; CHECK-NEXT: kshiftrq $25, %k0, %k0 +; CHECK-NEXT: kshiftlq $40, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $39, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $24, %k0, %k0 +; CHECK-NEXT: kshiftrq $24, %k0, %k0 +; CHECK-NEXT: kshiftlq $41, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $40, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $23, %k0, %k0 +; CHECK-NEXT: kshiftrq $23, %k0, %k0 +; CHECK-NEXT: kshiftlq $42, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $41, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $22, %k0, %k0 +; CHECK-NEXT: kshiftrq $22, %k0, %k0 +; CHECK-NEXT: kshiftlq $43, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $42, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $21, %k0, %k0 +; CHECK-NEXT: kshiftrq $21, %k0, %k0 +; CHECK-NEXT: kshiftlq $44, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $43, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $20, %k0, %k0 +; CHECK-NEXT: kshiftrq $20, %k0, %k0 +; 
CHECK-NEXT: kshiftlq $45, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $44, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $19, %k0, %k0 +; CHECK-NEXT: kshiftrq $19, %k0, %k0 +; CHECK-NEXT: kshiftlq $46, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $45, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $18, %k0, %k0 +; CHECK-NEXT: kshiftrq $18, %k0, %k0 +; CHECK-NEXT: kshiftlq $47, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $46, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $17, %k0, %k0 +; CHECK-NEXT: kshiftrq $17, %k0, %k0 +; CHECK-NEXT: kshiftlq $48, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $47, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $16, %k0, %k0 +; CHECK-NEXT: kshiftrq $16, %k0, %k0 +; CHECK-NEXT: kshiftlq $49, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $48, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $15, %k0, %k0 +; CHECK-NEXT: kshiftrq $15, %k0, %k0 +; CHECK-NEXT: kshiftlq $50, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $49, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $14, %k0, %k0 +; CHECK-NEXT: kshiftrq $14, %k0, %k0 +; CHECK-NEXT: kshiftlq $51, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $50, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $13, %k0, %k0 +; CHECK-NEXT: kshiftrq $13, %k0, %k0 +; CHECK-NEXT: kshiftlq $52, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $51, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; 
CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $12, %k0, %k0 +; CHECK-NEXT: kshiftrq $12, %k0, %k0 +; CHECK-NEXT: kshiftlq $53, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $52, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $11, %k0, %k0 +; CHECK-NEXT: kshiftrq $11, %k0, %k0 +; CHECK-NEXT: kshiftlq $54, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $53, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $10, %k0, %k0 +; CHECK-NEXT: kshiftrq $10, %k0, %k0 +; CHECK-NEXT: kshiftlq $55, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $54, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $9, %k0, %k0 +; CHECK-NEXT: kshiftrq $9, %k0, %k0 +; CHECK-NEXT: kshiftlq $56, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $55, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $8, %k0, %k0 +; CHECK-NEXT: kshiftrq $8, %k0, %k0 +; CHECK-NEXT: kshiftlq $57, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $56, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $7, %k0, %k0 +; CHECK-NEXT: kshiftrq $7, %k0, %k0 +; CHECK-NEXT: kshiftlq $58, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $57, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $6, %k0, %k0 +; CHECK-NEXT: kshiftrq $6, %k0, %k0 +; CHECK-NEXT: kshiftlq $59, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $58, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $5, %k0, %k0 +; CHECK-NEXT: kshiftrq $5, %k0, %k0 +; CHECK-NEXT: kshiftlq $60, %k0, %k1 +; CHECK-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $59, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $4, %k0, %k0 +; CHECK-NEXT: kshiftrq $4, %k0, %k0 +; CHECK-NEXT: kshiftlq $61, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $60, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $3, %k0, %k0 +; CHECK-NEXT: kshiftrq $3, %k0, %k0 +; CHECK-NEXT: kshiftlq $62, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $61, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kshiftlq $2, %k0, %k0 +; CHECK-NEXT: kshiftrq $2, %k0, %k0 +; CHECK-NEXT: kshiftlq $63, %k0, %k1 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; CHECK-NEXT: kshiftlq $62, %k2, %k2 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; CHECK-NEXT: kshiftlq $1, %k0, %k0 +; CHECK-NEXT: kshiftrq $1, %k0, %k0 +; CHECK-NEXT: kshiftlq $63, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq %k0, (%rdi) ; CHECK-NEXT: retq ret <64 x i1> %x } @@ -1127,19 +1569,451 @@ define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"=" define void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" { ; CHECK-LABEL: v64i1_shuffle: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 ; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0 +; CHECK-NEXT: kshiftrd $3, %k0, %k1 +; CHECK-NEXT: kshiftlq $2, %k0, %k2 +; CHECK-NEXT: kshiftlq $1, %k0, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $1, %k0, %k3 +; CHECK-NEXT: kshiftlq $63, %k3, %k3 +; CHECK-NEXT: kshiftrq $63, %k3, %k3 +; CHECK-NEXT: korq %k2, %k3, %k2 +; CHECK-NEXT: kshiftlq $3, %k0, %k3 +; CHECK-NEXT: kshiftlq $2, %k1, %k1 +; CHECK-NEXT: 
korq %k1, %k3, %k1 +; CHECK-NEXT: kshiftrd $2, %k0, %k3 +; CHECK-NEXT: kshiftlq $62, %k2, %k2 +; CHECK-NEXT: kshiftrq $62, %k2, %k2 +; CHECK-NEXT: korq %k1, %k2, %k1 +; CHECK-NEXT: kshiftlq $4, %k0, %k2 +; CHECK-NEXT: kshiftlq $3, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $5, %k0, %k3 +; CHECK-NEXT: kshiftlq $61, %k1, %k1 +; CHECK-NEXT: kshiftrq $61, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $5, %k0, %k2 +; CHECK-NEXT: kshiftlq $4, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $4, %k0, %k3 +; CHECK-NEXT: kshiftlq $60, %k1, %k1 +; CHECK-NEXT: kshiftrq $60, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $6, %k0, %k2 +; CHECK-NEXT: kshiftlq $5, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $7, %k0, %k3 +; CHECK-NEXT: kshiftlq $59, %k1, %k1 +; CHECK-NEXT: kshiftrq $59, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $7, %k0, %k2 +; CHECK-NEXT: kshiftlq $6, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $6, %k0, %k3 +; CHECK-NEXT: kshiftlq $58, %k1, %k1 +; CHECK-NEXT: kshiftrq $58, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $8, %k0, %k2 +; CHECK-NEXT: kshiftlq $7, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $9, %k0, %k3 +; CHECK-NEXT: kshiftlq $57, %k1, %k1 +; CHECK-NEXT: kshiftrq $57, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $9, %k0, %k2 +; CHECK-NEXT: kshiftlq $8, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $8, %k0, %k3 +; CHECK-NEXT: kshiftlq $56, %k1, %k1 +; CHECK-NEXT: kshiftrq $56, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $10, %k0, %k2 +; CHECK-NEXT: kshiftlq $9, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $11, %k0, %k3 +; CHECK-NEXT: kshiftlq $55, %k1, %k1 +; CHECK-NEXT: kshiftrq $55, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $11, %k0, %k2 +; 
CHECK-NEXT: kshiftlq $10, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $10, %k0, %k3 +; CHECK-NEXT: kshiftlq $54, %k1, %k1 +; CHECK-NEXT: kshiftrq $54, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $12, %k0, %k2 +; CHECK-NEXT: kshiftlq $11, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $13, %k0, %k3 +; CHECK-NEXT: kshiftlq $53, %k1, %k1 +; CHECK-NEXT: kshiftrq $53, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $13, %k0, %k2 +; CHECK-NEXT: kshiftlq $12, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $12, %k0, %k3 +; CHECK-NEXT: kshiftlq $52, %k1, %k1 +; CHECK-NEXT: kshiftrq $52, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $14, %k0, %k2 +; CHECK-NEXT: kshiftlq $13, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $15, %k0, %k3 +; CHECK-NEXT: kshiftlq $51, %k1, %k1 +; CHECK-NEXT: kshiftrq $51, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $15, %k0, %k2 +; CHECK-NEXT: kshiftlq $14, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $14, %k0, %k3 +; CHECK-NEXT: kshiftlq $50, %k1, %k1 +; CHECK-NEXT: kshiftrq $50, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $16, %k0, %k2 +; CHECK-NEXT: kshiftlq $15, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $17, %k0, %k3 +; CHECK-NEXT: kshiftlq $49, %k1, %k1 +; CHECK-NEXT: kshiftrq $49, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $17, %k0, %k2 +; CHECK-NEXT: kshiftlq $16, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $16, %k0, %k3 +; CHECK-NEXT: kshiftlq $48, %k1, %k1 +; CHECK-NEXT: kshiftrq $48, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $18, %k0, %k2 +; CHECK-NEXT: kshiftlq $17, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $19, %k0, %k3 +; CHECK-NEXT: kshiftlq $47, %k1, %k1 +; CHECK-NEXT: kshiftrq $47, %k1, %k1 +; 
CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $19, %k0, %k2 +; CHECK-NEXT: kshiftlq $18, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $18, %k0, %k3 +; CHECK-NEXT: kshiftlq $46, %k1, %k1 +; CHECK-NEXT: kshiftrq $46, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $20, %k0, %k2 +; CHECK-NEXT: kshiftlq $19, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $21, %k0, %k3 +; CHECK-NEXT: kshiftlq $45, %k1, %k1 +; CHECK-NEXT: kshiftrq $45, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $21, %k0, %k2 +; CHECK-NEXT: kshiftlq $20, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $20, %k0, %k3 +; CHECK-NEXT: kshiftlq $44, %k1, %k1 +; CHECK-NEXT: kshiftrq $44, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $22, %k0, %k2 +; CHECK-NEXT: kshiftlq $21, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $23, %k0, %k3 +; CHECK-NEXT: kshiftlq $43, %k1, %k1 +; CHECK-NEXT: kshiftrq $43, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $23, %k0, %k2 +; CHECK-NEXT: kshiftlq $22, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $22, %k0, %k3 +; CHECK-NEXT: kshiftlq $42, %k1, %k1 +; CHECK-NEXT: kshiftrq $42, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $24, %k0, %k2 +; CHECK-NEXT: kshiftlq $23, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $25, %k0, %k3 +; CHECK-NEXT: kshiftlq $41, %k1, %k1 +; CHECK-NEXT: kshiftrq $41, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $25, %k0, %k2 +; CHECK-NEXT: kshiftlq $24, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $24, %k0, %k3 +; CHECK-NEXT: kshiftlq $40, %k1, %k1 +; CHECK-NEXT: kshiftrq $40, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $26, %k0, %k2 +; CHECK-NEXT: kshiftlq $25, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $27, %k0, %k3 +; CHECK-NEXT: 
kshiftlq $39, %k1, %k1 +; CHECK-NEXT: kshiftrq $39, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $27, %k0, %k2 +; CHECK-NEXT: kshiftlq $26, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $26, %k0, %k3 +; CHECK-NEXT: kshiftlq $38, %k1, %k1 +; CHECK-NEXT: kshiftrq $38, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $28, %k0, %k2 +; CHECK-NEXT: kshiftlq $27, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $29, %k0, %k3 +; CHECK-NEXT: kshiftlq $37, %k1, %k1 +; CHECK-NEXT: kshiftrq $37, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $29, %k0, %k2 +; CHECK-NEXT: kshiftlq $28, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $28, %k0, %k3 +; CHECK-NEXT: kshiftlq $36, %k1, %k1 +; CHECK-NEXT: kshiftrq $36, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k1 +; CHECK-NEXT: kshiftlq $30, %k0, %k2 +; CHECK-NEXT: kshiftlq $29, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $31, %k0, %k3 +; CHECK-NEXT: kshiftlq $35, %k1, %k1 +; CHECK-NEXT: kshiftrq $35, %k1, %k1 +; CHECK-NEXT: korq %k2, %k1, %k2 +; CHECK-NEXT: kshiftlq $31, %k0, %k1 +; CHECK-NEXT: kshiftlq $30, %k3, %k3 +; CHECK-NEXT: korq %k3, %k1, %k3 ; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpmovm2b %k1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; CHECK-NEXT: vpmovb2m %ymm2, %k1 -; CHECK-NEXT: vpmovm2b %k0, %ymm2 -; CHECK-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; CHECK-NEXT: vpmovb2m %ymm2, %k2 -; CHECK-NEXT: vmovdqu8 %ymm1, 32(%rsi) {%k2} -; CHECK-NEXT: vmovdqu8 %ymm0, (%rsi) {%k1} +; CHECK-NEXT: kshiftrd $30, %k0, %k0 +; CHECK-NEXT: kshiftlq $34, %k2, %k2 +; CHECK-NEXT: kshiftrq $34, %k2, %k2 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftlq $32, %k0, %k3 +; CHECK-NEXT: kshiftlq $31, %k0, %k0 +; CHECK-NEXT: korq %k0, %k3, %k0 +; CHECK-NEXT: kshiftrd $1, 
%k1, %k3 +; CHECK-NEXT: kshiftlq $33, %k2, %k2 +; CHECK-NEXT: kshiftrq $33, %k2, %k2 +; CHECK-NEXT: korq %k0, %k2, %k0 +; CHECK-NEXT: kshiftlq $32, %k0, %k0 +; CHECK-NEXT: kshiftrq $32, %k0, %k0 +; CHECK-NEXT: kshiftlq $33, %k0, %k2 +; CHECK-NEXT: kshiftlq $32, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $34, %k0, %k2 +; CHECK-NEXT: kshiftlq $33, %k1, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $3, %k1, %k3 +; CHECK-NEXT: kshiftlq $31, %k0, %k0 +; CHECK-NEXT: kshiftrq $31, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $35, %k0, %k2 +; CHECK-NEXT: kshiftlq $34, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $2, %k1, %k3 +; CHECK-NEXT: kshiftlq $30, %k0, %k0 +; CHECK-NEXT: kshiftrq $30, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $36, %k0, %k2 +; CHECK-NEXT: kshiftlq $35, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $5, %k1, %k3 +; CHECK-NEXT: kshiftlq $29, %k0, %k0 +; CHECK-NEXT: kshiftrq $29, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $37, %k0, %k2 +; CHECK-NEXT: kshiftlq $36, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $4, %k1, %k3 +; CHECK-NEXT: kshiftlq $28, %k0, %k0 +; CHECK-NEXT: kshiftrq $28, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $38, %k0, %k2 +; CHECK-NEXT: kshiftlq $37, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $7, %k1, %k3 +; CHECK-NEXT: kshiftlq $27, %k0, %k0 +; CHECK-NEXT: kshiftrq $27, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $39, %k0, %k2 +; CHECK-NEXT: kshiftlq $38, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $6, %k1, %k3 +; CHECK-NEXT: kshiftlq $26, %k0, %k0 +; CHECK-NEXT: kshiftrq $26, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $40, %k0, %k2 +; CHECK-NEXT: kshiftlq $39, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; 
CHECK-NEXT: kshiftrd $9, %k1, %k3 +; CHECK-NEXT: kshiftlq $25, %k0, %k0 +; CHECK-NEXT: kshiftrq $25, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $41, %k0, %k2 +; CHECK-NEXT: kshiftlq $40, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $8, %k1, %k3 +; CHECK-NEXT: kshiftlq $24, %k0, %k0 +; CHECK-NEXT: kshiftrq $24, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $42, %k0, %k2 +; CHECK-NEXT: kshiftlq $41, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $11, %k1, %k3 +; CHECK-NEXT: kshiftlq $23, %k0, %k0 +; CHECK-NEXT: kshiftrq $23, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $43, %k0, %k2 +; CHECK-NEXT: kshiftlq $42, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $10, %k1, %k3 +; CHECK-NEXT: kshiftlq $22, %k0, %k0 +; CHECK-NEXT: kshiftrq $22, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $44, %k0, %k2 +; CHECK-NEXT: kshiftlq $43, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $13, %k1, %k3 +; CHECK-NEXT: kshiftlq $21, %k0, %k0 +; CHECK-NEXT: kshiftrq $21, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $45, %k0, %k2 +; CHECK-NEXT: kshiftlq $44, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $12, %k1, %k3 +; CHECK-NEXT: kshiftlq $20, %k0, %k0 +; CHECK-NEXT: kshiftrq $20, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $46, %k0, %k2 +; CHECK-NEXT: kshiftlq $45, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $15, %k1, %k3 +; CHECK-NEXT: kshiftlq $19, %k0, %k0 +; CHECK-NEXT: kshiftrq $19, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $47, %k0, %k2 +; CHECK-NEXT: kshiftlq $46, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $14, %k1, %k3 +; CHECK-NEXT: kshiftlq $18, %k0, %k0 +; CHECK-NEXT: kshiftrq $18, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $48, %k0, %k2 +; 
CHECK-NEXT: kshiftlq $47, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $17, %k1, %k3 +; CHECK-NEXT: kshiftlq $17, %k0, %k0 +; CHECK-NEXT: kshiftrq $17, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $49, %k0, %k2 +; CHECK-NEXT: kshiftlq $48, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $16, %k1, %k3 +; CHECK-NEXT: kshiftlq $16, %k0, %k0 +; CHECK-NEXT: kshiftrq $16, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $50, %k0, %k2 +; CHECK-NEXT: kshiftlq $49, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $19, %k1, %k3 +; CHECK-NEXT: kshiftlq $15, %k0, %k0 +; CHECK-NEXT: kshiftrq $15, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $51, %k0, %k2 +; CHECK-NEXT: kshiftlq $50, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $18, %k1, %k3 +; CHECK-NEXT: kshiftlq $14, %k0, %k0 +; CHECK-NEXT: kshiftrq $14, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $52, %k0, %k2 +; CHECK-NEXT: kshiftlq $51, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $21, %k1, %k3 +; CHECK-NEXT: kshiftlq $13, %k0, %k0 +; CHECK-NEXT: kshiftrq $13, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $53, %k0, %k2 +; CHECK-NEXT: kshiftlq $52, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $20, %k1, %k3 +; CHECK-NEXT: kshiftlq $12, %k0, %k0 +; CHECK-NEXT: kshiftrq $12, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $54, %k0, %k2 +; CHECK-NEXT: kshiftlq $53, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $23, %k1, %k3 +; CHECK-NEXT: kshiftlq $11, %k0, %k0 +; CHECK-NEXT: kshiftrq $11, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $55, %k0, %k2 +; CHECK-NEXT: kshiftlq $54, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $22, %k1, %k3 +; CHECK-NEXT: kshiftlq $10, %k0, %k0 +; CHECK-NEXT: kshiftrq $10, %k0, %k0 +; 
CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $56, %k0, %k2 +; CHECK-NEXT: kshiftlq $55, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $25, %k1, %k3 +; CHECK-NEXT: kshiftlq $9, %k0, %k0 +; CHECK-NEXT: kshiftrq $9, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $57, %k0, %k2 +; CHECK-NEXT: kshiftlq $56, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $24, %k1, %k3 +; CHECK-NEXT: kshiftlq $8, %k0, %k0 +; CHECK-NEXT: kshiftrq $8, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $58, %k0, %k2 +; CHECK-NEXT: kshiftlq $57, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $27, %k1, %k3 +; CHECK-NEXT: kshiftlq $7, %k0, %k0 +; CHECK-NEXT: kshiftrq $7, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $59, %k0, %k2 +; CHECK-NEXT: kshiftlq $58, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $26, %k1, %k3 +; CHECK-NEXT: kshiftlq $6, %k0, %k0 +; CHECK-NEXT: kshiftrq $6, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $60, %k0, %k2 +; CHECK-NEXT: kshiftlq $59, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $29, %k1, %k3 +; CHECK-NEXT: kshiftlq $5, %k0, %k0 +; CHECK-NEXT: kshiftrq $5, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $61, %k0, %k2 +; CHECK-NEXT: kshiftlq $60, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $28, %k1, %k3 +; CHECK-NEXT: kshiftlq $4, %k0, %k0 +; CHECK-NEXT: kshiftrq $4, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $62, %k0, %k2 +; CHECK-NEXT: kshiftlq $61, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftrd $31, %k1, %k3 +; CHECK-NEXT: kshiftlq $3, %k0, %k0 +; CHECK-NEXT: kshiftrq $3, %k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftlq $63, %k0, %k2 +; CHECK-NEXT: kshiftlq $62, %k3, %k3 +; CHECK-NEXT: korq %k3, %k2, %k2 +; CHECK-NEXT: kshiftlq $2, %k0, %k0 +; CHECK-NEXT: kshiftrq $2, 
%k0, %k0 +; CHECK-NEXT: korq %k2, %k0, %k0 +; CHECK-NEXT: kshiftrd $30, %k1, %k1 +; CHECK-NEXT: kshiftlq $1, %k0, %k0 +; CHECK-NEXT: kshiftrq $1, %k0, %k0 +; CHECK-NEXT: kshiftlq $63, %k1, %k1 +; CHECK-NEXT: korq %k1, %k0, %k1 +; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1} +; CHECK-NEXT: kshiftrq $32, %k1, %k1 +; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -1151,3 +2025,21 @@ entry: } declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>) +@mem64_dst = global i64 0, align 8 +@mem64_src = global i64 0, align 8 +define i32 @v64i1_inline_asm() "min-legal-vector-width"="256" { +; CHECK-LABEL: v64i1_inline_asm: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovq {{.*}}(%rip), %k0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: kmovq %k0, {{.*}}(%rip) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq + %1 = alloca i32, align 4 + %2 = load i64, i64* @mem64_src, align 8 + %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2) + store i64 %3, i64* @mem64_dst, align 8 + %4 = load i32, i32* %1, align 4 + ret i32 %4 +} |