path: root/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
author    Scott Linder <scott@scottlinder.com> 2018-10-08 18:47:01 +0000
committer Scott Linder <scott@scottlinder.com> 2018-10-08 18:47:01 +0000
commit    823549a6ec08340d1c2f329fa450f572546175c7 (patch)
tree      2dcaa0f4a3e071be89e9d18480f07804e70a46e0 /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent    6fc8d055656bf6e278dc50973672594ddd9f5edf (diff)
[AMDGPU] Legalize VGPR Rsrc operands for MUBUF instructions
Emit a waterfall loop in the general case for a potentially-divergent Rsrc operand. When practical, avoid this by using Addr64 instructions.

Recommits r341413 with changes to update the MachineDominatorTree when present.

Differential Revision: https://reviews.llvm.org/D51742

llvm-svn: 343992
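For readers new to the idiom: the emitted loop repeatedly takes the Rsrc of the first active lane, restricts EXEC to the lanes holding the same value, executes the wrapped MUBUF instruction once for that group, then retires those lanes until none remain. Below is a minimal scalar model of that control flow, not the committed code: it assumes a 64-lane wave, collapses the 128-bit descriptor to one dword per lane, and uses the GCC/Clang __builtin_ctzll builtin; all names are illustrative.

    #include <array>
    #include <cstdint>

    // Scalar model of the waterfall loop: services every active lane exactly
    // once and returns the saved EXEC mask.
    uint64_t waterfall(const std::array<uint32_t, 64> &RsrcPerLane,
                       uint64_t Exec) {
      const uint64_t SavedExec = Exec;      // S_MOV_B64 SaveExec, EXEC
      while (Exec) {                        // S_CBRANCH_EXECNZ back-edge
        // V_READFIRSTLANE_B32: the first active lane's descriptor becomes
        // the wave-uniform candidate.
        const int FirstLane = __builtin_ctzll(Exec);
        const uint32_t Uniform = RsrcPerLane[FirstLane];
        // V_CMP_EQ_U64 + S_AND_B64: mask of active lanes that match it.
        uint64_t Match = 0;
        for (int Lane = 0; Lane < 64; ++Lane)
          if (((Exec >> Lane) & 1) && RsrcPerLane[Lane] == Uniform)
            Match |= uint64_t(1) << Lane;
        // S_AND_SAVEEXEC_B64: the wrapped MUBUF instruction runs here for
        // just these lanes, with Uniform now safe as an SGPR operand.
        Exec ^= Match;                      // S_XOR_B64_term: retire them
      }
      return SavedExec;                     // S_MOV_B64 EXEC, SaveExec
    }

The committed loop performs the same selection with four V_READFIRSTLANE_B32s, two V_CMP_EQ_U64 compares combined by S_AND_B64, and S_AND_SAVEEXEC_B64, looping via S_XOR_B64_term / S_CBRANCH_EXECNZ and restoring EXEC in the remainder block.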
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 351
1 file changed, 259 insertions(+), 92 deletions(-)
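When an ADDR64 form is available, the patch avoids the loop entirely: the extractRsrcPtr helper added below pulls the 64-bit base pointer out of the VGPR descriptor (folding it into VAddr) and substitutes a constant descriptor. A sketch of that replacement descriptor's dword layout, with illustrative names:

    #include <cstdint>

    // Model of the descriptor built by extractRsrcPtr: the base-address
    // dwords become zero (the address now travels in VAddr); the top two
    // dwords carry the target's default RSRC_DATA_FORMAT.
    void buildNullRsrc(uint64_t DefaultFormat, uint32_t Rsrc[4]) {
      Rsrc[0] = 0;                             // Zero64 low   (sub0)
      Rsrc[1] = 0;                             // Zero64 high  (sub1)
      Rsrc[2] = uint32_t(DefaultFormat);       // format 31-0  (sub2)
      Rsrc[3] = uint32_t(DefaultFormat >> 32); // format 63-32 (sub3)
    }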
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c779dae23ec..61a0030aea2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -3612,7 +3613,191 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
FoldImmediate(*Copy, *Def, OpReg, &MRI);
}
-void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+// Emit the actual waterfall loop, executing the wrapped instruction for each
+// unique value of \p Rsrc across all lanes. In the best case we execute 1
+// iteration, in the worst case we execute 64 (once per lane).
+static void
+emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+ const DebugLoc &DL, MachineOperand &Rsrc) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ unsigned VRsrc = Rsrc.getReg();
+ unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+ // Beginning of the loop, read the next Rsrc variant.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
+
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
+ .addReg(SRsrcSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcSub1)
+ .addImm(AMDGPU::sub1)
+ .addReg(SRsrcSub2)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcSub3)
+ .addImm(AMDGPU::sub3);
+
+ // Update Rsrc operand to use the SGPR Rsrc.
+ Rsrc.setReg(SRsrc);
+ Rsrc.setIsKill(true);
+
+ // Identify all lanes with identical Rsrc operands in their VGPRs.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
+ .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
+ .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
+ .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
+ .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
+ .addReg(CondReg0)
+ .addReg(CondReg1);
+
+ MRI.setSimpleHint(SaveExec, AndCond);
+
+ // Update EXEC to matching lanes, saving original to SaveExec.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
+ .addReg(AndCond, RegState::Kill);
+
+ // The original instruction is here; we insert the terminators after it.
+ I = LoopBB.end();
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveExec);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+}
+
+// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
+// with SGPRs by iterating over all unique values across all lanes.
+static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+ MachineOperand &Rsrc, MachineDominatorTree *MDT) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
+ .addReg(AMDGPU::EXEC);
+
+ // Killed uses in the instruction we are waterfalling around will be
+ // incorrect due to the added control-flow.
+ for (auto &MO : MI.uses()) {
+ if (MO.isReg() && MO.isUse()) {
+ MRI.clearKillFlags(MO.getReg());
+ }
+ }
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
+ MachineBasicBlock::iterator J = I++;
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ LoopBB->splice(LoopBB->begin(), &MBB, J);
+
+ MBB.addSuccessor(LoopBB);
+
+ // Update dominators. We know that MBB immediately dominates LoopBB, that
+ // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
+ // dominates all of the successors transferred to it from MBB that MBB used
+ // to dominate.
+ if (MDT) {
+ MDT->addNewBlock(LoopBB, &MBB);
+ MDT->addNewBlock(RemainderBB, LoopBB);
+ for (auto &Succ : RemainderBB->successors()) {
+ if (MDT->dominates(&MBB, Succ)) {
+ MDT->changeImmediateDominator(Succ, RemainderBB);
+ }
+ }
+ }
+
+ emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+
+ // Restore the EXEC mask
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(SaveExec);
+}
+
+// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
+static std::tuple<unsigned, unsigned>
+extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Extract the ptr from the resource descriptor.
+ unsigned RsrcPtr =
+ TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
+ AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
+
+ // Create an empty resource descriptor
+ unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
+
+ // Zero64 = 0
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
+ .addImm(0);
+
+ // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+ // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
+
+ // NewSRsrc = {Zero64, SRsrcFormat}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ return std::make_tuple(RsrcPtr, NewSRsrc);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI,
+ MachineDominatorTree *MDT) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3754,75 +3939,56 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
- // Legalize MUBUF* instructions by converting to addr64 form.
- // FIXME: If we start using the non-addr64 instructions for compute, we
- // may need to legalize them as above. This especially applies to the
- // buffer_load_format_* variants and variants with idxen (or bothen).
- int SRsrcIdx =
+ // Legalize MUBUF* instructions.
+ int RsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
- if (SRsrcIdx != -1) {
+ if (RsrcIdx != -1) {
// We have an MUBUF instruction
- MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
- unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
- if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
- RI.getRegClass(SRsrcRC))) {
+ MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
+ unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
+ if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
+ RI.getRegClass(RsrcRC))) {
// The operands are legal.
// FIXME: We may need to legalize operands besides srsrc.
return;
}
- MachineBasicBlock &MBB = *MI.getParent();
-
- // Extract the ptr from the resource descriptor.
- unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
-
- // Create an empty resource descriptor
- unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
- uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
- // Zero64 = 0
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
- .addImm(0);
-
- // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
-
- // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
- .addImm(RsrcDataFormat >> 32);
+ // Legalize a VGPR Rsrc.
+ //
+ // If the instruction is _ADDR64, we can avoid a waterfall by extracting
+ // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
+ // a zero-value SRsrc.
+ //
+ // If the instruction is _OFFSET (both idxen and offen disabled), and we
+ // support ADDR64 instructions, we can convert to ADDR64 and do the same as
+ // above.
+ //
+ // Otherwise we are on non-ADDR64 hardware, and/or we have
+ // idxen/offen/bothen, and we fall back to a waterfall loop.
- // NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
+ MachineBasicBlock &MBB = *MI.getParent();
MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- if (VAddr) {
+ if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+ unsigned RsrcPtr, NewSRsrc;
+ std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
- // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+ // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
DebugLoc DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
- // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+ // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -3830,13 +3996,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
.addImm(AMDGPU::sub0)
.addReg(NewVAddrHi)
.addImm(AMDGPU::sub1);
- } else {
+
+ VAddr->setReg(NewVAddr);
+ Rsrc->setReg(NewSRsrc);
+ } else if (!VAddr && ST.hasAddr64()) {
// This instruction is the _OFFSET variant, so we need to convert it to
// ADDR64.
assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
< AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
+ unsigned RsrcPtr, NewSRsrc;
+ std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
+
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -3852,10 +4025,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
.add(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .add(*SRsrc)
+ .addReg(NewVAddr)
+ .addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset);
@@ -3879,10 +4050,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
.add(*VData)
.add(*VDataIn)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .add(*SRsrc)
+ .addReg(NewVAddr)
+ .addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset)
.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
@@ -3894,23 +4063,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
NewVAddr)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
.addImm(AMDGPU::sub0)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
.addImm(AMDGPU::sub1);
-
- VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
- SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
+ } else {
+ // This is another variant; legalize Rsrc with a waterfall loop from
+ // VGPRs to SGPRs.
+ loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
}
-
- // Update the instruction to use NewVaddr
- VAddr->setReg(NewVAddr);
- // Update the instruction to use NewSRsrc
- SRsrc->setReg(NewSRsrc);
}
}
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+ MachineDominatorTree *MDT) const {
SetVectorType Worklist;
Worklist.insert(&TopInst);
@@ -3928,29 +4094,29 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
break;
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst);
+ splitScalar64BitAddSub(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
// FIXME: The u32 versions currently selected use the carry.
- if (moveScalarAddSub(Worklist, Inst))
+ if (moveScalarAddSub(Worklist, Inst, MDT))
continue;
// Default handling
break;
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
Inst.eraseFromParent();
continue;
@@ -4037,7 +4203,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
case AMDGPU::S_XNOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
Inst.eraseFromParent();
continue;
@@ -4146,7 +4312,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Legalize all operands other than the offset. Notably, convert the srsrc
// into SGPRs using v_readfirstlane if needed.
- legalizeOperands(*NewInstr);
+ legalizeOperands(*NewInstr, MDT);
continue;
}
}
@@ -4154,7 +4320,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
continue;
}
@@ -4243,7 +4409,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Legalize the operands
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
if (HasDst)
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
@@ -4251,8 +4417,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
- MachineInstr &Inst) const {
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
if (ST.hasAddNoCarry()) {
// Assume there is no user of scc since we don't select this in that case.
// Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -4276,7 +4442,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
Inst.setDesc(get(NewOpc));
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return true;
@@ -4396,8 +4562,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitAddSub(
- SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
MachineBasicBlock &MBB = *Inst.getParent();
@@ -4457,16 +4624,16 @@ void SIInstrInfo::splitScalar64BitAddSub(
// Try to legalize the operands in case we need to swap the order to keep it
// valid.
- legalizeOperands(*LoHalf);
- legalizeOperands(*HiHalf);
+ legalizeOperands(*LoHalf, MDT);
+ legalizeOperands(*HiHalf, MDT);
// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBinaryOp(
- SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -4524,8 +4691,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
// Try to legalize the operands in case we need to swap the order to keep it
// valid.
- legalizeOperands(LoHalf);
- legalizeOperands(HiHalf);
+ legalizeOperands(LoHalf, MDT);
+ legalizeOperands(HiHalf, MDT);
// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
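With these changes, every entry point that can reach the waterfall path threads an optional dominator tree through to loadSRsrcFromVGPR. A hypothetical caller, assuming nothing beyond the overloads added in this patch and MachineDominatorTree itself:

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineDominators.h"

    using namespace llvm;

    // Passing nullptr preserves the old behavior; with a non-null tree,
    // loadSRsrcFromVGPR keeps it consistent via addNewBlock /
    // changeImmediateDominator when the waterfall loop splits the block.
    void legalizeWithDomInfo(const SIInstrInfo &TII, MachineInstr &MI,
                             MachineDominatorTree *MDT) {
      TII.legalizeOperands(MI, MDT);
    }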