author    Ron Lieberman <ronlieb.g@gmail.com>    2018-12-03 13:04:54 +0000
committer Ron Lieberman <ronlieb.g@gmail.com>    2018-12-03 13:04:54 +0000
commit    16de4fd2ebac7621c50a621cd6e693cc2bbcef2b (patch)
tree      16bbc03bd81aabe71a1fcee39e7b18c08eb55974 /llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
parent    0b639da54e6a1d324be97fdf1f87f284824ebb52 (diff)
[AMDGPU] Add sdwa support for ADD|SUB U64 decomposed Pseudos
The introduction of the S_{ADD|SUB}_U64_PSEUDO instructions, which are decomposed into VOP3 instruction pairs (for S_ADD_U64_PSEUDO: V_ADD_I32_e64 and V_ADDC_U32_e64; for S_SUB_U64_PSEUDO: V_SUB_I32_e64 and V_SUBB_U32_e64), precludes the use of SDWA to encode a constant. SDWA (Sub-Dword Addressing) is supported on VOP1 and VOP2 instructions, but not on VOP3 instructions.

We want to fold the bit-and operand into the instruction encoding for the V_ADD_I32 instruction, which requires transforming the VOP3 form of the instruction into the VOP2 (_e32) form:

    %19:vgpr_32 = V_AND_B32_e32 255, killed %16:vgpr_32, implicit $exec
    %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
    %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec

which then allows the SDWA encoding and becomes:

    %47:vgpr_32 = V_ADD_I32_sdwa 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
    %48:vgpr_32 = V_ADDC_U32_e32 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec

Differential Revision: https://reviews.llvm.org/D54882

llvm-svn: 348132
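For context, a minimal sketch of the kind of source pattern that produces the MIR above (the function name and exact codegen are illustrative assumptions, not taken from the commit): a 64-bit add whose 32-bit operand is masked down to a byte. The `& 0xff` is the bit-and that this patch lets the pass fold into the SDWA byte-select rather than emitting a standalone V_AND_B32.

    // Illustrative only: an assumed source pattern, compiled for an AMDGPU
    // target. The 64-bit add is decomposed into V_ADD_I32 / V_ADDC_U32, and
    // the byte mask can become an SDWA BYTE_0 source select on the low add.
    #include <cstdint>

    uint64_t add_low_byte(uint64_t x, uint32_t y) {
      return x + (y & 0xffu);
    }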
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp')
-rw-r--r--    llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp    97
1 file changed, 95 insertions, 2 deletions
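To make the SDWA operands in the commit message concrete: as I read the SDWA operand layout, the trailing immediates "0, 6, 0, 6, 0" on the V_ADD_I32_sdwa line are the clamp, dst_sel, dst_unused, src0_sel, and src1_sel operands, with select encodings BYTE_0 = 0 through DWORD = 6. A simplified model of a source select (an illustration, not the full ISA definition) shows why a BYTE_0 select on src1 subsumes the explicit and-with-255:

    // Simplified model of an SDWA source select. The enumerator values
    // (BYTE_0 = 0 ... DWORD = 6) match the encodings seen in the MIR above.
    #include <cstdint>

    enum class SdwaSel { BYTE_0 = 0, BYTE_1, BYTE_2, BYTE_3, WORD_0, WORD_1, DWORD };

    uint32_t applySel(uint32_t v, SdwaSel sel) {
      switch (sel) {
      case SdwaSel::BYTE_0: return v & 0xff;          // subsumes V_AND_B32 with 255
      case SdwaSel::BYTE_1: return (v >> 8) & 0xff;
      case SdwaSel::BYTE_2: return (v >> 16) & 0xff;
      case SdwaSel::BYTE_3: return (v >> 24) & 0xff;
      case SdwaSel::WORD_0: return v & 0xffff;
      case SdwaSel::WORD_1: return (v >> 16) & 0xffff;
      case SdwaSel::DWORD:  return v;                 // full 32-bit source
      }
      return v;
    }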
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0e000b72962..2d43d5d05ef 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+ bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+ void pseudoOpConvertToVOP2(MachineInstr &MI,
+ const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
@@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
}
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Convert V_ADD_I32_e64 into V_ADD_I32_e32 and V_ADDC_U32_e64 into
+// V_ADDC_U32_e32. This allows isConvertibleToSDWA to accept the
+// V_ADD_I32_e32, which can then be converted into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+// %19:vgpr_32 = V_AND_B32_e32 255,
+// killed %16:vgpr_32, implicit $exec
+// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+// %47:vgpr_32 = V_ADD_I32_sdwa
+// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+// implicit-def $vcc, implicit $exec
+// %48:vgpr_32 = V_ADDC_U32_e32
+// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+ const GCNSubtarget &ST) const {
+ int Opc = MI.getOpcode();
+ assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+ "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+ // Can the candidate MI be shrunk?
+ if (!TII->canShrink(MI, *MRI))
+ return;
+ Opc = AMDGPU::getVOPe32(Opc);
+ // Find the dependent V_{ADDC|SUBB}_U32 instruction through the carry-out (sdst).
+ const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (!Sdst)
+ return;
+ MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+ if (!NextOp)
+ return;
+ MachineInstr &MISucc = *NextOp->getParent();
+ // Can the successor be shrunk?
+ if (!TII->canShrink(MISucc, *MRI))
+ return;
+ int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+ // Make sure the carry in/out are subsequently unused.
+ MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+ if (!CarryIn)
+ return;
+ MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+ if (!CarryOut)
+ return;
+ if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+ return;
+ // Make sure VCC or its subregs are dead before MI, scanning a
+ // neighborhood of up to 25 instructions.
+ MachineBasicBlock &MBB = *MI.getParent();
+ auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return;
+ // Make sure VCC is not clobbered by any instruction in the range (MI, MISucc).
+ for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::VCC, TRI))
+ return;
+ }
+ // Make the two new e32 instruction variants.
+ // Replace MI with V_{SUB|ADD}_I32_e32
+ auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+ MI.eraseFromParent();
+ // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+ auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+ MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
@@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
bool Changed = false;
do {
+ // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
+ // Look for a possible ADD or SUB that resulted from a previously lowered
+ // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+ // lowers the pair of instructions into e32 form.
+ matchSDWAOperands(MBB);
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI &&
+ (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+ PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+ pseudoOpConvertToVOP2(*PotentialMI, ST);
+ }
+ SDWAOperands.clear();
+
+ // Generate potential match list.
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {