diff options
| author | Igor Breger <igor.breger@intel.com> | 2015-10-18 09:56:39 +0000 |
|---|---|---|
| committer | Igor Breger <igor.breger@intel.com> | 2015-10-18 09:56:39 +0000 |
| commit | cbb9550537b6c8bcd0ab86b11eeca67b4876eb46 (patch) | |
| tree | ac9881ee515a194ab50324d45e3afc5168f60027 /llvm/lib/Target | |
| parent | 265309e38a99a74d872a818b1c49085165e4e1ac (diff) | |
| download | bcm5719-llvm-cbb9550537b6c8bcd0ab86b11eeca67b4876eb46.tar.gz bcm5719-llvm-cbb9550537b6c8bcd0ab86b11eeca67b4876eb46.zip | |
AVX512: Lowering i8/i16 vector CTLZ using the dword LZCNT vector instruction
Differential Revision: http://reviews.llvm.org/D13632
llvm-svn: 250649
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 142 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 4 |
2 files changed, 123 insertions, 23 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 446098a0711..b1bafba047c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1509,29 +1509,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); - } - if (Subtarget->hasVLX() && Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); - setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); - setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); - setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); - - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - 
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); - } + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); + } + } // Subtarget->hasCDI() + if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); @@ -1605,6 +1625,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); @@ -1646,6 +1668,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasVLX()) setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom); + } + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -17501,12 +17530,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support requires VLX) are expanded +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on its Lo and Hi parts and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. 
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split the vector; its Lo and Hi parts will be handled in the next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. 
@@ -17536,12 +17628,16 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -19554,8 +19650,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 8a22e2e842d..31a7e95df15 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -786,6 +786,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // 
vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), |

