diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-01-11 17:02:00 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-01-11 17:02:00 +0000 |
| commit | de5fbe9c60d88308e5eabfe0360ba66863c09768 (patch) | |
| tree | e4be0b70177eefabcf10f6b58e0d631f3ab7d410 /llvm/lib/Target/AMDGPU | |
| parent | 94ad1b527f938452f496cf75289bd61cbdfa7875 (diff) | |
| download | bcm5719-llvm-de5fbe9c60d88308e5eabfe0360ba66863c09768.tar.gz bcm5719-llvm-de5fbe9c60d88308e5eabfe0360ba66863c09768.zip | |
AMDGPU: Pattern match ffbh pattern to instruction.
The hardware instruction's output on 0 is -1 rather than 32.
Eliminate a test and select to -1. This removes an extra instruction
from the compatibility function with HSAIL's firstbit instruction.
llvm-svn: 257352
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 103 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 2 |
6 files changed, 92 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 810b325b230..fafe58d65b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -282,7 +282,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (!Subtarget->hasFFBH()) + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); if (!Subtarget->hasFFBL()) @@ -2170,9 +2172,11 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); - assert(Src.getValueType() == MVT::i64); - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); @@ -2507,6 +2511,79 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. +// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 
+SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. 
+ if (VT == MVT::i32) + return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); + + return SDValue(); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2531,23 +2608,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - } - - break; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2759,6 +2821,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index b53a9f611b4..952fd4cc502 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -69,6 +69,9 @@ private: SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -265,6 +268,7 @@ enum NodeType : unsigned { BFE_I32, // Extract 
range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + FFBH_U32, // ctlz with -1 if input is zero. MUL_U24, MUL_I24, MAD_U24, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 70e589c2842..b7a263e34e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,6 +191,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 779a14e95d2..2245f1417e5 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -349,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; let hasSideEffects = 1 in { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f7a9880752f..e85d78a69ee 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2026,7 +2026,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return 
performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -2108,7 +2108,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b7df058b7c0..89692ab71f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; |

