author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-28 18:34:24 +0000
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-28 18:34:24 +0000
commit | de6c421cc81f935b74f771638d674c745118ef8b
tree | f4bb7759d36288ef9f98cbee7bddaa570ea5ce28 /llvm/lib/Target
parent | ec71e018d65c0622861efb6c3e7789910afaa3c2
AMDGPU: Shrink insts to fold immediates
This needs to be done in the SSA fold operands
pass to be effective, so there is a bit of overlap
with SIShrinkInstructions, but I don't think that
is practically avoidable.
llvm-svn: 340859
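
To make the intent concrete, here is a rough MIR-style sketch of the rewrite this enables. It is illustrative only: the virtual register names, the literal value, and the exact operand syntax are invented for the example, not taken from this commit. The VOP3 (_e64) form of the add cannot encode the literal, so the immediate can only be folded once the instruction is shrunk to its VOP2 (_e32) form, and the code below only attempts that when the carry output is unused and VCC is dead at that point.

```
; Before: the out-of-range immediate has to live in a register, since the
; 64-bit VOP3 encoding of the add cannot take a literal constant.
%1:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
%2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %1, %0:vgpr_32, implicit $exec

; After SIFoldOperands, assuming the carry def %3 is unused and VCC is dead
; here: the add is shrunk to the VOP2 form, which defines VCC implicitly,
; and the literal is folded straight into src0. The V_MOV becomes dead.
%2:vgpr_32 = V_ADD_I32_e32 999, %0:vgpr_32, implicit-def $vcc, implicit $exec
```

Doing the shrink as part of the SSA operand folding, rather than leaving it entirely to SIShrinkInstructions, is what makes the fold possible at all, which is presumably the overlap the message above accepts.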
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 88
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 48
4 files changed, 138 insertions, 54 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 6c57926b7d1..1fda9701c39 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,13 +35,16 @@ struct FoldCandidate {
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
+  int ShrinkOpcode;
   unsigned char UseOpNo;
   MachineOperand::MachineOperandType Kind;
   bool Commuted;
 
   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false) :
-    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+                bool Commuted_ = false,
+                int ShrinkOp = -1) :
+    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+    Kind(FoldOp->getType()),
     Commuted(Commuted_) {
     if (FoldOp->isImm()) {
       ImmToFold = FoldOp->getImm();
@@ -68,6 +71,14 @@ struct FoldCandidate {
   bool isCommuted() const {
     return Commuted;
   }
+
+  bool needsShrink() const {
+    return ShrinkOpcode != -1;
+  }
+
+  int getShrinkOpcode() const {
+    return ShrinkOpcode;
+  }
 };
 
 class SIFoldOperands : public MachineFunctionPass {
@@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
 }
 
 static bool updateOperand(FoldCandidate &Fold,
+                          const SIInstrInfo &TII,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
@@ -189,10 +201,42 @@ static bool updateOperand(FoldCandidate &Fold,
         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
       }
     }
+
+    if (Fold.needsShrink()) {
+      MachineBasicBlock *MBB = MI->getParent();
+      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+      if (Liveness != MachineBasicBlock::LQR_Dead)
+        return false;
+
+      int Op32 = Fold.getShrinkOpcode();
+      MachineOperand &Dst0 = MI->getOperand(0);
+      MachineOperand &Dst1 = MI->getOperand(1);
+      assert(Dst0.isDef() && Dst1.isDef());
+
+      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
+      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+
+      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+
+      // Keep the old instruction around to avoid breaking iterators, but
+      // replace the outputs with dummy registers.
+      Dst0.setReg(NewReg0);
+      Dst1.setReg(NewReg1);
+
+      if (Fold.isCommuted())
+        TII.commuteInstruction(*Inst32, false);
+      return true;
+    }
+
     Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
 
+  assert(!Fold.needsShrink() && "not handled");
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -261,6 +305,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     if (isUseMIInFoldList(FoldList, MI))
       return false;
 
+    unsigned CommuteOpNo = OpNo;
+
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -269,11 +315,12 @@
 
     if (CanCommute) {
       if (CommuteIdx0 == OpNo)
-        OpNo = CommuteIdx1;
+        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
-        OpNo = CommuteIdx0;
+        CommuteOpNo = CommuteIdx0;
     }
 
+
     // One of operands might be an Imm operand, and OpNo may refer to it after
     // the call of commuteInstruction() below. Such situations are avoided
     // here explicitly as OpNo must be a register operand to be a candidate
@@ -286,12 +333,39 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
+           Opc == AMDGPU::V_SUB_I32_e64 ||
+           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+          OpToFold->isImm()) {
+        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+        // Verify the other operand is a VGPR, otherwise we would violate the
+        // constant bus restriction.
+        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
+        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+        if (!OtherOp.isReg() ||
+            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
+          return false;
+
+        const MachineOperand &SDst = MI->getOperand(1);
+        assert(SDst.isDef());
+
+        // TODO: Handle cases with a used carry.
+        if (!MRI.use_nodbg_empty(SDst.getReg()))
+          return false;
+
+        int Op32 = AMDGPU::getVOPe32(Opc);
+        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
+                                         Op32));
+        return true;
+      }
+
       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
     return true;
   }
 
@@ -757,7 +831,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
-    if (updateOperand(Fold, *TRI)) {
+    if (updateOperand(Fold, *TII, *TRI)) {
       // Clear kill flags.
       if (Fold.isReg()) {
         assert(Fold.OpToFold && Fold.OpToFold->isReg());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0954f7f311a..bd6c5ac9d83 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2572,7 +2572,60 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
 
   // Check output modifiers
   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
+}
+
+// Set VCC operand with all flags from \p Orig, except for setting it as
+// implicit.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
+                                           unsigned Op32) const {
+  MachineBasicBlock *MBB = MI.getParent();;
+  MachineInstrBuilder Inst32 =
+    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+
+  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
+  // For VOPC instructions, this is replaced by an implicit def of vcc.
+  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+  if (Op32DstIdx != -1) {
+    // dst
+    Inst32.add(MI.getOperand(0));
+  } else {
+    assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+           "Unexpected case");
+  }
+
+  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
+
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1)
+    Inst32.add(*Src1);
+
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  if (Src2) {
+    int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+    if (Op32Src2Idx != -1) {
+      Inst32.add(*Src2);
+    } else {
+      // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+      // replaced with an implicit read of vcc. This was already added
+      // during the initial BuildMI, so find it to preserve the flags.
+      copyFlagsToImplicitVCC(*Inst32, *Src2);
+    }
+  }
+  return Inst32;
 }
 
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1bc2e9d59c6..1a7fc02ef6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -694,6 +694,9 @@ public:
   bool canShrink(const MachineInstr &MI,
                  const MachineRegisterInfo &MRI) const;
 
+  MachineInstr *buildShrunkInst(MachineInstr &MI,
+                                unsigned NewOpcode) const;
+
   bool verifyInstruction(const MachineInstr &MI,
                          StringRef &ErrInfo) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index a20dafcbb74..d37ad077dd6 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -120,19 +120,6 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   return false;
 }
 
-// Copy MachineOperand with all flags except setting it as implicit.
-static void copyFlagsToImplicitVCC(MachineInstr &MI,
-                                   const MachineOperand &Orig) {
-
-  for (MachineOperand &Use : MI.implicit_operands()) {
-    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
-      Use.setIsUndef(Orig.isUndef());
-      Use.setIsKill(Orig.isKill());
-      return;
-    }
-  }
-}
-
 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
   return isInt<16>(Src.getImm()) &&
          !TII->isInlineConstant(*Src.getParent(),
@@ -434,40 +421,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       // We can shrink this instruction
       LLVM_DEBUG(dbgs() << "Shrinking " << MI);
 
-      MachineInstrBuilder Inst32 =
-          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
-
-      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
-      // For VOPC instructions, this is replaced by an implicit def of vcc.
-      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
-      if (Op32DstIdx != -1) {
-        // dst
-        Inst32.add(MI.getOperand(0));
-      } else {
-        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
-               "Unexpected case");
-      }
-
-
-      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-
-      const MachineOperand *Src1 =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-      if (Src1)
-        Inst32.add(*Src1);
-
-      if (Src2) {
-        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
-        if (Op32Src2Idx != -1) {
-          Inst32.add(*Src2);
-        } else {
-          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
-          // replaced with an implicit read of vcc. This was already added
-          // during the initial BuildMI, so find it to preserve the flags.
-          copyFlagsToImplicitVCC(*Inst32, *Src2);
-        }
-      }
-
+      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
       ++NumInstructionsShrunk;
 
       // Copy extra operands not present in the instruction definition.