-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 69
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td   | 29
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mask-op.ll |  8
3 files changed, 68 insertions, 38 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f32768a06..e8b3f3656b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1177,6 +1177,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
 
+    // There is no byte sized k-register load or store without AVX512DQ.
+    if (!Subtarget.hasDQI()) {
+      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+    }
+
     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
@@ -18983,6 +18996,30 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+  EVT VT = St->getValue().getValueType();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+  assert(VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+         VT.getVectorNumElements() <= 8 && "Unexpected VT");
+  assert(!St->isTruncatingStore() && "Expected non-truncating store");
+  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+         "Expected AVX512F without AVX512DQI");
+
+  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+                          DAG.getIntPtrConstant(0, dl));
+  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+  return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                      St->getPointerInfo(), St->getAlignment(),
+                      St->getMemOperand()->getFlags());
+}
+
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -18990,20 +19027,41 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
   MVT RegVT = Op.getSimpleValueType();
   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   assert(RegVT.isInteger() &&
          "We only custom lower integer vector sext loads.");
 
-  // Nothing useful we can do without SSE2 shuffles.
-  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
   EVT MemVT = Ld->getMemoryVT();
 
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+  if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+    assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+    assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+                                Ld->getPointerInfo(), Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags());
+
+    // Replace chain users with the new chain.
+    assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+                                  DAG.getBitcast(MVT::v8i1, NewLd),
+                                  DAG.getIntPtrConstant(0, dl));
+    return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+  }
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned RegSz = RegVT.getSizeInBits();
 
@@ -24766,7 +24824,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
-  case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
+  case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
+  case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
   case ISD::FABS:
   case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 91e4aca1489..59ed5fd6f31 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2816,40 +2816,15 @@ let Predicates = [HasDQI] in {
   def : Pat<(store VK1:$src, addr:$dst),
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
 
+  def : Pat<(v1i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
   def : Pat<(v2i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
   def : Pat<(v4i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
 }
 
-let Predicates = [HasAVX512, NoDQI] in {
-  def : Pat<(store VK1:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK2:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK4:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK8:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
-                  sub_8bit)))>;
-
-  def : Pat<(v8i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
-  def : Pat<(v2i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
-  def : Pat<(v4i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}
 let Predicates = [HasAVX512] in {
-  def : Pat<(v1i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>;
   def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
             (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
 }
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 88d5ed4f7ad..1038c90ab9d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -348,9 +348,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 define i8 @conv1(<8 x i1>* %R) {
 ; KNL-LABEL: conv1:
 ; KNL: ## %bb.0: ## %entry
-; KNL-NEXT: kxnorw %k0, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: movb $-1, (%rdi)
 ; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT: movb $-2, %al
 ; KNL-NEXT: retq
@@ -365,9 +363,7 @@ define i8 @conv1(<8 x i1>* %R) {
 ;
 ; AVX512BW-LABEL: conv1:
 ; AVX512BW: ## %bb.0: ## %entry
-; AVX512BW-NEXT: kxnorw %k0, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: movb $-1, (%rdi)
 ; AVX512BW-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT: movb $-2, %al
 ; AVX512BW-NEXT: retq
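For reference, a minimal IR sketch (not part of this patch; the function names and the llc invocation below are illustrative) that exercises the new custom lowering on an AVX512F-only target, where byte-sized k-register moves (KMOVB) are unavailable:

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s

; Store a 4-bit mask to memory.
define void @store_v4i1(<4 x i1> %m, <4 x i1>* %p) {
entry:
  store <4 x i1> %m, <4 x i1>* %p
  ret void
}

; Reload a 4-bit mask from memory.
define <4 x i1> @load_v4i1(<4 x i1>* %p) {
entry:
  %m = load <4 x i1>, <4 x i1>* %p
  ret <4 x i1> %m
}

With this change, both functions reach the new LowerStore/LowerLoad code on a non-DQI target: the mask is widened to v8i1, bitcast to i8, and stored or loaded as a plain byte, rather than going through the removed HasAVX512/NoDQI selection patterns.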

