diff options
| author | Sam Parker <sam.parker@arm.com> | 2017-03-15 08:27:11 +0000 |
|---|---|---|
| committer | Sam Parker <sam.parker@arm.com> | 2017-03-15 08:27:11 +0000 |
| commit | 654cb8263a3c3c8ec24285e260da3ab0a0c7abf0 (patch) | |
| tree | adf5cdef8a2e4a8d4bb6d2b9a5d61571ebc94e01 /llvm/lib | |
| parent | 1b192336d8a9fe747a5cbac173037f4c215802c9 (diff) | |
| download | bcm5719-llvm-654cb8263a3c3c8ec24285e260da3ab0a0c7abf0.tar.gz bcm5719-llvm-654cb8263a3c3c8ec24285e260da3ab0a0c7abf0.zip | |
[ARM] Enable SMLAL[B|T] isel
Enable instruction selection of the 64-bit signed multiply-accumulate
instructions that operate on 16-bit operands (SMLALBB, SMLALBT, SMLALTB
and SMLALTT). Selection is enabled from ARMv5TE onwards in ARM mode, and
for ARMv6T2 and other DSP-enabled architectures in Thumb mode.
Differential Revision: https://reviews.llvm.org/D30044
llvm-svn: 297809
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 133 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrInfo.td | 56 | ||||
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrThumb2.td | 29 |
4 files changed, 182 insertions, 40 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 62953f4be18..15ae0c7940b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1337,6 +1337,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; + case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; + case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; + case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; + case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; case ARMISD::SMULWB: return "ARMISD::SMULWB"; case ARMISD::SMULWT: return "ARMISD::SMULWT"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; @@ -9497,8 +9501,90 @@ static SDValue findMUL_LOHI(SDValue V) { return SDValue(); } +static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb()) { + if (!Subtarget->hasDSP()) + return SDValue(); + } else if (!Subtarget->hasV5TEOps()) + return SDValue(); + + // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and + // accumulates the product into a 64-bit value. 
The 16-bit values will + // be sign extended somehow or SRA'd into 32-bit values + // (addc (adde (mul 16bit, 16bit), lo), hi) + SDValue Mul = AddcNode->getOperand(0); + SDValue Hi = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) { + Hi = AddcNode->getOperand(0); + Mul = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + } + + SDValue SRA = AddeNode->getOperand(0); + SDValue Lo = AddeNode->getOperand(1); + if (SRA.getOpcode() != ISD::SRA) { + SRA = AddeNode->getOperand(1); + Lo = AddeNode->getOperand(0); + if (SRA.getOpcode() != ISD::SRA) + return SDValue(); + } + if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { + if (Const->getZExtValue() != 31) + return SDValue(); + } else + return SDValue(); + + if (SRA.getOperand(0) != Mul) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(AddcNode); + unsigned Opcode = 0; + SDValue Op0; + SDValue Op1; + + if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALBB; + Op0 = Mul.getOperand(0); + Op1 = Mul.getOperand(1); + } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALBT; + Op0 = Mul.getOperand(0); + Op1 = Mul.getOperand(1).getOperand(0); + } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALTB; + Op0 = Mul.getOperand(0).getOperand(0); + Op1 = Mul.getOperand(1); + } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALTT; + Op0 = Mul->getOperand(0).getOperand(0); + Op1 = Mul->getOperand(1).getOperand(0); + } + + if (!Op0 || !Op1) + return SDValue(); + + SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), + Op0, Op1, Lo, Hi); + // Replace the ADDs' nodes uses by the MLA node's values. 
+ SDValue HiMLALResult(SMLAL.getNode(), 1); + SDValue LoMLALResult(SMLAL.getNode(), 0); + + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { // Look for multiply add opportunities. // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is @@ -9535,12 +9621,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode, AddcNode->getValueType(0) == MVT::i32 && "Expect ADDC with two result values. First: i32"); - // Check that the ADDC adds the low result of the S/UMUL_LOHI. + // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it + // maybe a SMLAL which multiplies two 16-bit values. if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && AddcOp0->getOpcode() != ISD::SMUL_LOHI && AddcOp1->getOpcode() != ISD::UMUL_LOHI && AddcOp1->getOpcode() != ISD::SMUL_LOHI) - return SDValue(); + return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget); // Check for the triangle shape. SDValue AddeOp0 = AddeNode->getOperand(0); @@ -9628,7 +9715,7 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, // as the addend, and it's handled in PerformUMLALCombine. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) - return AddCombineTo64bitMLAL(AddeNode, DCI); + return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); // Check that we have a glued ADDC node. 
SDNode* AddcNode = AddeNode->getOperand(2).getNode(); @@ -9645,7 +9732,7 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, UmlalNode = AddcNode->getOperand(1).getNode(); AddHi = AddcNode->getOperand(0); } else { - return AddCombineTo64bitMLAL(AddeNode, DCI); + return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); } // The ADDC should be glued to an ADDE node, which uses the same UMLAL as @@ -11894,6 +11981,42 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } + case ARMISD::SMLALBB: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALBT: { + unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTB: { + unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTT: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } 
case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 20a678e4c7f..180558e936a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -180,6 +180,10 @@ class InstrItineraryData; UMLAL, // 64bit Unsigned Accumulate Multiply SMLAL, // 64bit Signed Accumulate Multiply UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply + SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 + SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 + SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 + SMLALTT, // 64-bit signed accumulate multiply top, top 16 // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 164c895dc61..dd2ab5392cc 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -92,6 +92,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; +def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameAs<0, 4>, + SDTCisSameAs<0, 5>]>; + // Node definitions. 
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; @@ -185,6 +192,10 @@ def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>; def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>; +def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>; +def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; +def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; +def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. @@ -4183,29 +4194,28 @@ defm SMUL : AI_smul<"smul">; defm SMLA : AI_smla<"smla">; // Halfword multiply accumulate long: SMLAL<x><y>. -def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; +class SMLAL<bits<2> opc1, string asm> + : AMulxyI64<0b0001010, 
opc1, + (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi), + IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, + Requires<[IsARM, HasV5TE]>, + Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; + +def SMLALBB : SMLAL<0b00, "smlalbb">; +def SMLALBT : SMLAL<0b10, "smlalbt">; +def SMLALTB : SMLAL<0b01, "smlaltb">; +def SMLALTT : SMLAL<0b11, "smlaltt">; + +def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALTT $Rn, $Rm, $RLo, $RHi)>; // Helper class for AI_smld. class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops, diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 194505f7586..6b97bbf1a13 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2732,19 +2732,24 @@ def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)), (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; -class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern> - : T2FourReg_mac<1, op22_20, op7_4, - (outs rGPR:$Ra, rGPR:$Rd), - (ins rGPR:$Rn, rGPR:$Rm), - IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasDSP]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - // Halfword multiple accumulate long: SMLAL<x><y> -def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>; -def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>; -def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>; -def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>; +def t2SMLALBB : T2MlaLong<0b100, 
0b1000, "smlalbb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">, + Requires<[IsThumb2, HasDSP]>; + +def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>; class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc> : T2ThreeReg_mac<0, op22_20, op7_4, |

