diff options
| author | Sam Parker <sam.parker@arm.com> | 2019-07-23 14:08:46 +0000 |
|---|---|---|
| committer | Sam Parker <sam.parker@arm.com> | 2019-07-23 14:08:46 +0000 |
| commit | 57e87dd81beb8df988bfa8e3369a2c7a888da093 (patch) | |
| tree | 8facbd02bd1d3e05d92ffe9479703223dc92e3e9 /llvm/lib | |
| parent | c60c12fb10d49fbd7fbdc8e8ddc3134e16e0d1e7 (diff) | |
| download | bcm5719-llvm-57e87dd81beb8df988bfa8e3369a2c7a888da093.tar.gz bcm5719-llvm-57e87dd81beb8df988bfa8e3369a2c7a888da093.zip | |
[ARM][LowOverheadLoops] Fix branch target codegen
While lowering test.set.loop.iterations, it wasn't checked how the
brcond was using the result and so the wls could branch to the loop
preheader instead of not entering it. The same was true for
loop.decrement.reg.
So brcond and br_cc and now lowered manually when using the hwloop
intrinsics. During this we now check whether the result has been
negated and whether we're using SETEQ or SETNE and 0 or 1. We can
then figure out which basic block the WLS and LE should be targeting.
Differential Revision: https://reviews.llvm.org/D64616
llvm-svn: 366809
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 25 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 183 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrInfo.td | 10 |
4 files changed, 183 insertions, 37 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index b349627b67b..d0cd56fd5a3 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2998,13 +2998,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } - case ARMISD::WLS: { - SDValue Ops[] = { N->getOperand(1), // Loop count - N->getOperand(2), // Exit target + case ARMISD::WLS: + case ARMISD::LE: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), N->getOperand(0) }; - SDNode *LoopStart = - CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops); - ReplaceUses(N, LoopStart); + unsigned Opc = N->getOpcode() == ARMISD::WLS ? + ARM::t2WhileLoopStart : ARM::t2LoopEnd; + SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceUses(N, New); + CurDAG->RemoveDeadNode(N); + return; + } + case ARMISD::LOOP_DEC: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), + N->getOperand(0) }; + SDNode *Dec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), Ops); + ReplaceUses(N, Dec); CurDAG->RemoveDeadNode(N); return; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fe620e02bfa..222b5bca7a6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -669,8 +669,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addMVEVectorTypes(Subtarget->hasMVEFloatOps()); // Combine low-overhead loop intrinsics so that we can lower i1 types. - if (Subtarget->hasLOB()) + if (Subtarget->hasLOB()) { setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); + } if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -1589,6 +1591,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; } return nullptr; } @@ -13034,43 +13038,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +// Given N, the value controlling the conditional branch, search for the loop +// intrinsic, returning it, along with how the value is used. We need to handle +// patterns such as the following: +// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) +// (brcond (setcc (loop.decrement), 0, eq), exit) +// (brcond (setcc (loop.decrement), 0, ne), header) +static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, + bool &Negate) { + switch (N->getOpcode()) { + default: + break; + case ISD::XOR: { + if (!isa<ConstantSDNode>(N.getOperand(1))) + return SDValue(); + if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) + return SDValue(); + Negate = !Negate; + return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); + } + case ISD::SETCC: { + auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!Const) + return SDValue(); + if (Const->isNullValue()) + Imm = 0; + else if (Const->isOne()) + Imm = 1; + else + return SDValue(); + CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); + return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations && + IntOp != Intrinsic::loop_decrement_reg) + return SDValue(); + return N; + } + } + return SDValue(); +} + static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { - // Look for (brcond (xor test.set.loop.iterations, -1) - SDValue CC = N->getOperand(1); - unsigned Opc = CC->getOpcode(); - SDValue Int; - if ((Opc == ISD::XOR || Opc == ISD::SETCC) && - (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + // The hwloop intrinsics that we're interested are used for control-flow, + // either for entering or exiting the loop: + // - test.set.loop.iterations will test whether its operand is zero. If it + // is zero, the proceeding branch should not enter the loop. + // - loop.decrement.reg also tests whether its operand is zero. If it is + // zero, the proceeding branch should not branch back to the beginning of + // the loop. + // So here, we need to check that how the brcond is using the result of each + // of the intrinsics to ensure that we're branching to the right place at the + // right time. + + ISD::CondCode CC; + SDValue Cond; + int Imm = 1; + bool Negate = false; + SDValue Chain = N->getOperand(0); + SDValue Dest; - assert((isa<ConstantSDNode>(CC->getOperand(1)) && - cast<ConstantSDNode>(CC->getOperand(1))->isOne()) && - "Expected to compare against 1"); + if (N->getOpcode() == ISD::BRCOND) { + CC = ISD::SETEQ; + Cond = N->getOperand(1); + Dest = N->getOperand(2); + } else { + assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); + CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + Cond = N->getOperand(2); + Dest = N->getOperand(4); + if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { + if (!Const->isOne() && !Const->isNullValue()) + return SDValue(); + Imm = Const->getZExtValue(); + } else + return SDValue(); + } - Int = CC->getOperand(0); - } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) - Int = CC; - else + SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); + if (!Int) return SDValue(); - unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_set_loop_iterations) - return SDValue(); + if (Negate) + CC = ISD::getSetCCInverse(CC, true); + + auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 0) || + (CC == ISD::SETNE && Imm == 1) || + (CC == ISD::SETLT && Imm == 1) || + (CC == ISD::SETULT && Imm == 1); + }; + + auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 1) || + (CC == ISD::SETNE && Imm == 0) || + (CC == ISD::SETGT && Imm == 0) || + (CC == ISD::SETUGT && Imm == 0) || + (CC == ISD::SETGE && Imm == 1) || + (CC == ISD::SETUGE && Imm == 1); + }; + + assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && + "unsupported condition"); SDLoc dl(Int); - SDValue Chain = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDValue Elements = Int.getOperand(2); - SDValue ExitBlock = N->getOperand(2); + unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); + assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) + && "expected single br user"); + SDNode *Br = *N->use_begin(); + SDValue OtherTarget = Br->getOperand(1); + + // Update the unconditional branch to branch to the given Dest. + auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { + SDValue NewBrOps[] = { Br->getOperand(0), Dest }; + SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); + DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); + }; - // TODO: Once we start supporting tail predication, we can add another - // operand to WLS for the number of elements processed in a vector loop. + if (IntOp == Intrinsic::test_set_loop_iterations) { + SDValue Res; + // We expect this 'instruction' to branch when the counter is zero. + if (IsTrueIfZero(CC, Imm)) { + SDValue Ops[] = { Chain, Elements, Dest }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } else { + // The logic is the reverse of what we need for WLS, so find the other + // basic block target: the target of the proceeding br. + UpdateUncondBr(Br, Dest, DAG); - SDValue Ops[] = { Chain, Elements, ExitBlock }; - SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); - DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); - return Res; + SDValue Ops[] = { Chain, Elements, OtherTarget }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; + } else { + SDValue Size = DAG.getTargetConstant( + cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); + SDValue Args[] = { Int.getOperand(0), Elements, Size, }; + SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, + DAG.getVTList(MVT::i32, MVT::Other), Args); + DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); + + // We expect this instruction to branch when the count is not zero. + SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; + + // Update the unconditional branch to target the loop preheader if we've + // found the condition has been reversed. + if (Target == OtherTarget) + UpdateUncondBr(Br, Dest, DAG); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(LoopDec.getNode(), 1), Chain); + + SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; + return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); + } + return SDValue(); } /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. @@ -13304,7 +13434,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); - case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); + case ISD::BRCOND: + case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index b14c6487209..84f2f7239fe 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -126,6 +126,8 @@ class VectorType; WIN__DBZCHK, // Windows' divide by zero check WLS, // Low-overhead loops, While Loop Start + LOOP_DEC, // Really a part of LE, performs the sub + LE, // Low-overhead loops, Loop End VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index e3514546385..74f0c3dd964 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -108,8 +108,8 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, // TODO Add another operand for 'Size' so that we can re-use this node when we // start supporting *TP versions. -def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, OtherVT>]>; +def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; @@ -265,9 +265,9 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; -def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, - [SDNPHasChain]>; - +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // ARM Flag Definitions. |

