diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 100 |
1 files changed, 68 insertions, 32 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bddfb190a3b..6146c49a0f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -417,8 +417,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -1113,9 +1115,11 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: - return LowerCTLZ(Op, DAG); + return LowerCTLZ_CTTZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -2154,13 +2158,33 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +static bool isCttzOpc(unsigned Opc) { + return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; +} + +SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + unsigned ISDOpc, NewOpc; + if (isCtlzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTLZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBH_U32; + } else if (isCttzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTTZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBL_B32; + } else + llvm_unreachable("Unexpected OPCode!!!"); + if (ZeroUndef && Src.getValueType() == MVT::i32) - return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + return DAG.getNode(NewOpc, SL, MVT::i32, Src); SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -2173,24 +2197,33 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + SDValue ZeroOrOne = isCtlzOpc(Op.getOpcode()) ? Zero : One; + SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo; + SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, ZeroOrOne, ISD::SETEQ); - SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); - SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); + SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); - SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); - - // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) - SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + SDValue Add, NewOpr; + if (isCtlzOpc(Op.getOpcode())) { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); + } else { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); + // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); + } if (!ZeroUndef) { // Test if the full 64-bit input is zero. // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, // which we probably don't want. - SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); - SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; + SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, ZeroOrOne, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction // with the same cycles, otherwise it is slower. @@ -2201,11 +2234,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { // The instruction returns -1 for 0 input, but the defined intrinsic // behavior is to return the number of bits. - NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, - SrcIsZero, Bits32, NewCtlz); + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewOpr); } - return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); } SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, @@ -3117,13 +3150,10 @@ static bool isNegativeOne(SDValue Val) { return false; } -static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; -} - -SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue Op, - const SDLoc &DL) const { + const SDLoc &DL, + unsigned Opc) const { EVT VT = Op.getValueType(); EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && @@ -3133,11 +3163,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, if (VT != MVT::i32) Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); + SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); + FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); - return FFBH; + return FFBX; } // The native instructions return -1 on 0 input. Optimize out a select that @@ -3147,7 +3177,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, // against the bitwidth. // // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, +SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); @@ -3158,20 +3188,25 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); + unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : + AMDGPUISD::FFBH_U32; + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && - isCtlzOpc(RHS.getOpcode()) && + (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && - isCtlzOpc(LHS.getOpcode()) && + (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } return SDValue(); @@ -3304,7 +3339,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } // There's no reason to not do this if the condition has other uses. - return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); + return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); } static bool isConstantFPZero(SDValue N) { @@ -3892,6 +3927,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(FFBH_I32) + NODE_NAME_CASE(FFBL_B32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MULHI_U24) |