author     Nicolai Haehnle <nhaehnle@gmail.com>    2018-11-07 21:53:43 +0000
committer  Nicolai Haehnle <nhaehnle@gmail.com>    2018-11-07 21:53:43 +0000
commit     bc233f5523f43de34788179a11e390aaa69c8209 (patch)
tree       8521369469d3c3e2a52afe36233671a4cfb8d1c8 /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent     61396ff67c05863e63264431d209c6a91ff7dc53 (diff)
Revert "AMDGPU: Divergence-driven selection of scalar buffer load intrinsics"
This reverts commit r344696 for now (except for some test additions). See https://bugs.freedesktop.org/show_bug.cgi?id=108611. llvm-svn: 346364
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  185
1 file changed, 178 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4dd06df1233..562428ef37c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3558,13 +3558,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
// pointer value is uniform.
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
- SBase->setReg(SGPR);
- }
- MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
- if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
- SOff->setReg(SGPR);
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
}
}
@@ -4193,6 +4188,115 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
Inst.eraseFromParent();
continue;
+
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ unsigned VDst;
+ unsigned NewOpcode;
+
+ switch(Opcode) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ splitScalarBuffer(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
+
+ const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+ auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+ unsigned Offset = 0;
+
+ // FIXME: This isn't safe because the addressing mode doesn't work
+ // correctly if vaddr is negative.
+ //
+ // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+ //
+ // See if we can extract an immediate offset by recognizing one of these:
+ // V_ADD_I32_e32 dst, imm, src1
+ // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+ // V_ADD will be removed by "Remove dead machine instructions".
+ if (Add &&
+ (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+ static const unsigned SrcNames[2] = {
+ AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ };
+
+ // Find a literal offset in one of source operands.
+ for (int i = 0; i < 2; i++) {
+ const MachineOperand *Src =
+ getNamedOperand(*Add, SrcNames[i]);
+
+ if (Src->isReg()) {
+ MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
+ if (Def) {
+ if (Def->isMoveImmediate())
+ Src = &Def->getOperand(1);
+ else if (Def->isCopy()) {
+ auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+ if (Mov && Mov->isMoveImmediate()) {
+ Src = &Mov->getOperand(1);
+ }
+ }
+ }
+ }
+
+ if (Src) {
+ if (Src->isImm())
+ Offset = Src->getImm();
+ else if (Src->isCImm())
+ Offset = Src->getCImm()->getZExtValue();
+ }
+
+ if (Offset && isLegalMUBUFImmOffset(Offset)) {
+ VAddr = getNamedOperand(*Add, SrcNames[!i]);
+ break;
+ }
+
+ Offset = 0;
+ }
+ }
+
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+ get(NewOpcode), VDst)
+ .add(*VAddr) // vaddr
+ .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+ .addImm(0) // soffset
+ .addImm(Offset) // offset
+ .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .cloneMemRefs(Inst)
+ .getInstr();
+
+ MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+ VDst);
+ addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+ Inst.eraseFromParent();
+
+ // Legalize all operands other than the offset. Notably, convert the srsrc
+ // into SGPRs using v_readfirstlane if needed.
+ legalizeOperands(*NewInstr, MDT);
+ continue;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4674,6 +4778,73 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ auto &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
+ MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+ MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+ MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ unsigned Count = 0;
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+ switch(Opcode) {
+ default:
+ return;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ Count = 2;
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ Count = 4;
+ break;
+ }
+
+ // FIXME: Should also attempt to build VAddr and Offset like the non-split
+ // case (see call site for this function)
+
+ // Create a vector of result registers
+ SmallVector<unsigned, 8> ResultRegs;
+ for (unsigned i = 0; i < Count ; ++i) {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+ .addReg(Offset.getReg()) // offset
+ .addReg(Rsrc.getReg()) // rsrc
+ .addImm(0) // soffset
+ .addImm(i << 4) // inst_offset
+ .addImm(Glc.getImm()) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addMemOperand(*Inst.memoperands_begin());
+ // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
+ auto &NewDestOp = NewMI.getOperand(0);
+ for (unsigned i = 0 ; i < 4 ; i++)
+ ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+ RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+ }
+ // Create a new combined result to replace original with
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+ get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+ for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+ CombinedResBuilder
+ .addReg(ResultRegs[i])
+ .addImm(RI.getSubRegFromChannel(i));
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,