author     Tim Renouf <tpr.llvm@botech.co.uk>  2019-03-18 19:35:44 +0000
committer  Tim Renouf <tpr.llvm@botech.co.uk>  2019-03-18 19:35:44 +0000
commit     cfdfba996b081092814d9b0856fcb8b2e12f73e7 (patch)
tree       d748fb4f2debefc9a2f7a358095f9fc0a5f237ec /llvm/lib
parent     2e94f6e584d9dc95701d54d5dd47fad84d5bf985 (diff)
[AMDGPU] Asm/disasm clamp modifier on vop3 int arithmetic
Allow the clamp modifier on vop3 int arithmetic instructions in assembly
and disassembly.
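
For example, the assembler now accepts (and the disassembler prints) the
clamp modifier on the VOP3 (_e64) forms of these instructions. A minimal
sketch of the syntax, assuming GFX9 and arbitrary register choices:

  v_add_u32_e64 v1, v2, v3 clamp
  v_sub_u32_e64 v1, v2, v3 clamp

Here clamp requests unsigned saturation of the 32-bit result instead of
wraparound.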
Supporting this required adding a clamp operand to the affected
instructions in MIR and MC, which in turn meant fixing up several
places in codegen and in the MIR tests.
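
In MIR that operand appears as a trailing immediate on the VOP3 form,
e.g. (an illustrative line, not taken from the tests; register numbers
and classes are hypothetical):

  %2:vgpr_32, %3:sreg_64_xexec = V_ADD_I32_e64 %0, %1, 0, implicit $exec

where the final 0 is the new clamp operand.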
Differential Revision: https://reviews.llvm.org/D59267
Change-Id: Ic7775105f02a985b668fa658a0cd7837846a534e
llvm-svn: 356399
Diffstat (limited to 'llvm/lib')
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp   | 32
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          | 13
 llvm/lib/Target/AMDGPU/SIInstrInfo.td           |  7
 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 12
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp       |  9
 llvm/lib/Target/AMDGPU/VOP2Instructions.td      | 29
6 files changed, 69 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bbe642eb32b..73c824aa153 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -932,7 +932,8 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
     AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
 
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
-                       { N->getOperand(0), N->getOperand(1) });
+                       {N->getOperand(0), N->getOperand(1),
+                        CurDAG->getConstant(0, {}, MVT::i1) /*clamp bit*/});
 }
 
 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -1032,13 +1033,19 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                  Zero, Addr.getOperand(1));
 
       if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+        SmallVector<SDValue, 3> Opnds;
+        Opnds.push_back(Zero);
+        Opnds.push_back(Addr.getOperand(1));
+
         // FIXME: Select to VOP3 version for with-carry.
-        unsigned SubOp = Subtarget->hasAddNoCarry() ?
-          AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+        unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+        if (Subtarget->hasAddNoCarry()) {
+          SubOp = AMDGPU::V_SUB_U32_e64;
+          Opnds.push_back(Zero); // clamp bit
+        }
 
-        MachineSDNode *MachineSub
-          = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
-                                   Zero, Addr.getOperand(1));
+        MachineSDNode *MachineSub =
+            CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
 
         Base = SDValue(MachineSub, 0);
         Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
@@ -1106,12 +1113,17 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                  Zero, Addr.getOperand(1));
 
       if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
-        unsigned SubOp = Subtarget->hasAddNoCarry() ?
-          AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+        SmallVector<SDValue, 3> Opnds;
+        Opnds.push_back(Zero);
+        Opnds.push_back(Addr.getOperand(1));
+        unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+        if (Subtarget->hasAddNoCarry()) {
+          SubOp = AMDGPU::V_SUB_U32_e64;
+          Opnds.push_back(Zero); // clamp bit
+        }
 
         MachineSDNode *MachineSub
-          = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
-                                   Zero, Addr.getOperand(1));
+          = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
 
         Base = SDValue(MachineSub, 0);
         Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 196ecd70f0d..abd324e3892 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1092,7 +1092,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
       getAddNoCarry(Entry, Insert, DL, TIDReg)
         .addReg(TIDReg)
-        .addReg(TIDIGZReg);
+        .addReg(TIDIGZReg)
+        .addImm(0); // clamp bit
     } else {
       // Get the wave id
       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
@@ -1117,7 +1118,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
   getAddNoCarry(MBB, MI, DL, TmpReg)
     .addImm(LDSOffset)
-    .addReg(TIDReg);
+    .addReg(TIDReg)
+    .addImm(0); // clamp bit
 
   return TmpReg;
 }
@@ -4443,6 +4445,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
 
     Inst.RemoveOperand(3);
     Inst.setDesc(get(NewOpc));
+    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
     Inst.addImplicitDefUseOperands(*MBB.getParent());
     MRI.replaceRegWith(OldDstReg, ResultReg);
     legalizeOperands(Inst, MDT);
@@ -4703,7 +4706,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
     .addReg(CarryReg, RegState::Define)
     .add(SrcReg0Sub0)
-    .add(SrcReg1Sub0);
+    .add(SrcReg1Sub0)
+    .addImm(0); // clamp bit
 
   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
   MachineInstr *HiHalf =
@@ -4711,7 +4715,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
     .add(SrcReg0Sub1)
     .add(SrcReg1Sub1)
-    .addReg(CarryReg, RegState::Kill);
+    .addReg(CarryReg, RegState::Kill)
+    .addImm(0); // clamp bit
 
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
     .addReg(DestSub0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 680c287e0e9..336404fd63a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1760,10 +1760,12 @@ def PatGenMode {
   int Pattern = 1;
 }
 
-class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0> {
+class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
+                  bit _EnableClamp = 0> {
 
   field list<ValueType> ArgVT = _ArgVT;
   field bit EnableF32SrcMods = _EnableF32SrcMods;
+  field bit EnableClamp = _EnableClamp;
 
   field ValueType DstVT = ArgVT[0];
   field ValueType Src0VT = ArgVT[1];
@@ -1817,7 +1819,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0> {
   field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
   field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
 
-  field bit HasClamp = isModifierType<Src0VT>.ret;
+  field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret;
   field bit HasSDWAClamp = EmitDst;
   field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
   field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
@@ -1943,6 +1945,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
 def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
 def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
 def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>;
 
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f6bb7b3196f..d663616f02d 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -723,7 +723,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
         .addReg(ImmReg)
-        .addReg(AddrReg->getReg(), 0, BaseSubReg);
+        .addReg(AddrReg->getReg(), 0, BaseSubReg)
+        .addImm(0); // clamp bit
     BaseSubReg = 0;
   }
 
@@ -816,7 +817,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
 
     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
         .addReg(ImmReg)
-        .addReg(AddrReg->getReg(), 0, BaseSubReg);
+        .addReg(AddrReg->getReg(), 0, BaseSubReg)
+        .addImm(0); // clamp bit
     BaseSubReg = 0;
   }
 
@@ -1144,7 +1146,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
       BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
           .addReg(CarryReg, RegState::Define)
           .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
-          .add(OffsetLo);
+          .add(OffsetLo)
+          .addImm(0); // clamp bit
   (void)LoHalf;
   LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
 
@@ -1153,7 +1156,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
-          .addReg(CarryReg, RegState::Kill);
+          .addReg(CarryReg, RegState::Kill)
+          .addImm(0); // clamp bit
   (void)HiHalf;
   LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 726e73c7f25..c3e13a6cebd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -334,7 +334,8 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
 
   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
     .addReg(OffsetReg, RegState::Kill)
-    .addReg(FIReg);
+    .addReg(FIReg)
+    .addImm(0); // clamp bit
 }
 
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -1108,7 +1109,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
           TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
             .addImm(Offset)
-            .addReg(ScaledReg, RegState::Kill);
+            .addReg(ScaledReg, RegState::Kill)
+            .addImm(0); // clamp bit
         } else {
           unsigned ConstOffsetReg
             = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1117,7 +1119,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             .addImm(Offset);
           TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
             .addReg(ConstOffsetReg, RegState::Kill)
-            .addReg(ScaledReg, RegState::Kill);
+            .addReg(ScaledReg, RegState::Kill)
+            .addImm(0); // clamp bit
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 1cb9bdb77ab..27db3f32f91 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -277,9 +277,9 @@ def VOP_MAC_F16 : VOP_MAC <f16>;
 def VOP_MAC_F32 : VOP_MAC <f32>;
 
 // Write out to vcc or arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> {
   let Asm32 = "$vdst, vcc, $src0, $src1";
-  let Asm64 = "$vdst, $sdst, $src0, $src1";
+  let Asm64 = "$vdst, $sdst, $src0, $src1$clamp";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
@@ -289,7 +289,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
 
 // Write out to vcc or arbitrary SGPR and read in from vcc or
 // arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
   // We use VCSrc_b32 to exclude literal constants, even though the
   // encoding normally allows them since the implicit VCC use means
   // using one would always violate the constant bus
@@ -297,7 +297,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   // technically be possible to use VCC again as src0.
   let Src0RC32 = VCSrc_b32;
   let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
-  let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+  let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
@@ -440,9 +440,9 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f
 
 let SubtargetPredicate = HasAddNoCarryInsts in {
 
-defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
-defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
-defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
+defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
 
 }
 
 } // End isCommutable = 1
@@ -473,12 +473,12 @@ defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I
 
 def : GCNPat<
   (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
-  (V_ADDC_U32_e64 $src0, $src1, $src2)
+  (V_ADDC_U32_e64 $src0, $src1, $src2, 0)
 >;
 
 def : GCNPat<
   (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
-  (V_SUBB_U32_e64 $src0, $src1, $src2)
+  (V_SUBB_U32_e64 $src0, $src1, $src2, 0)
 >;
 
 // These instructions only exist on SI and CI
@@ -505,6 +505,15 @@ class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
   )
 >;
 
+class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+  GCNPat<
+      (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+      !if(!cast<Commutable_REV>(Inst).IsOrig,
+        (Inst $src0, $src1, 0),
+        (Inst $src1, $src0, 0)
+      )
+  >;
+
 let AddedComplexity = 1 in {
   def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
   def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
@@ -520,7 +529,7 @@ let SubtargetPredicate = HasAddNoCarryInsts in {
 
 def : DivergentBinOp<add, V_ADD_I32_e32>;
 
-def : DivergentBinOp<add, V_ADD_I32_e64>;
+def : DivergentClampingBinOp<add, V_ADD_I32_e64>;
 
 def : DivergentBinOp<sub, V_SUB_I32_e32>;
 def : DivergentBinOp<sub, V_SUBREV_I32_e32>;