diff options
| author | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-11-30 22:55:38 +0000 |
|---|---|---|
| committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-11-30 22:55:38 +0000 |
| commit | a7b00058e05f6862d4ef2c8f8bb287b09f7e41b1 (patch) | |
| tree | 3f571b7d7ba5368d8ca4dc8010ef04ffe0ee6eef /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | |
| parent | a9cc92c247ce5d0ecc3399e7af6e40a3d59bbf6c (diff) | |
| download | bcm5719-llvm-a7b00058e05f6862d4ef2c8f8bb287b09f7e41b1.tar.gz bcm5719-llvm-a7b00058e05f6862d4ef2c8f8bb287b09f7e41b1.zip | |
AMDGPU: Divergence-driven selection of scalar buffer load intrinsics
Summary:
Moving SMRD to VMEM in SIFixSGPRCopies is rather bad for performance if
the load is really uniform. So select the scalar load intrinsics directly
to either VMEM or SMRD buffer loads based on divergence analysis.
If an offset happens to end up in a VGPR -- either because a
floating-point calculation was involved, or due to other remaining deficiencies
in SIFixSGPRCopies -- we use v_readfirstlane.
There is some unrelated churn in tests since we now select MUBUF offsets
in a unified way with non-scalar buffer loads.
Change-Id: I170e6816323beb1348677b358c9d380865cd1a19
Reviewers: arsenm, alex-t, rampitec, tpr
Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D53283
llvm-svn: 348050
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 185 |
1 file changed, 7 insertions, 178 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 902ed3bf627..d4e47c63dca 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3602,8 +3602,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, // pointer value is uniform. MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { - unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); - SBase->setReg(SGPR); + unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); + SBase->setReg(SGPR); + } + MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); + if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); + SOff->setReg(SGPR); } } @@ -4272,115 +4277,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); Inst.eraseFromParent(); continue; - - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { - unsigned VDst; - unsigned NewOpcode; - - switch(Opcode) { - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: - splitScalarBuffer(Worklist, Inst); - Inst.eraseFromParent(); - continue; - } - - const MachineOperand *VAddr 
= getNamedOperand(Inst, AMDGPU::OpName::soff); - auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); - unsigned Offset = 0; - - // FIXME: This isn't safe because the addressing mode doesn't work - // correctly if vaddr is negative. - // - // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. - // - // See if we can extract an immediate offset by recognizing one of these: - // V_ADD_I32_e32 dst, imm, src1 - // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 - // V_ADD will be removed by "Remove dead machine instructions". - if (Add && - (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || - Add->getOpcode() == AMDGPU::V_ADD_U32_e32 || - Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { - static const unsigned SrcNames[2] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - }; - - // Find a literal offset in one of source operands. - for (int i = 0; i < 2; i++) { - const MachineOperand *Src = - getNamedOperand(*Add, SrcNames[i]); - - if (Src->isReg()) { - MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg()); - if (Def) { - if (Def->isMoveImmediate()) - Src = &Def->getOperand(1); - else if (Def->isCopy()) { - auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); - if (Mov && Mov->isMoveImmediate()) { - Src = &Mov->getOperand(1); - } - } - } - } - - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } - - if (Offset && isLegalMUBUFImmOffset(Offset)) { - VAddr = getNamedOperand(*Add, SrcNames[!i]); - break; - } - - Offset = 0; - } - } - - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), - get(NewOpcode), VDst) - .add(*VAddr) // vaddr - .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc - .addImm(0) // soffset - .addImm(Offset) // offset - .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) - .addImm(0) // slc - .addImm(0) // tfe - .cloneMemRefs(Inst) - .getInstr(); - - MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), - 
VDst); - addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); - Inst.eraseFromParent(); - - // Legalize all operands other than the offset. Notably, convert the srsrc - // into SGPRs using v_readfirstlane if needed. - legalizeOperands(*NewInstr, MDT); - continue; - } } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -4955,73 +4851,6 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist, - MachineInstr &Inst) const { - MachineBasicBlock &MBB = *Inst.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineBasicBlock::iterator MII = Inst; - auto &DL = Inst.getDebugLoc(); - - MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);; - MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase); - MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff); - MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc); - - unsigned Opcode = Inst.getOpcode(); - unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; - unsigned Count = 0; - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); - - switch(Opcode) { - default: - return; - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - Count = 2; - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: - Count = 4; - break; - } - - // FIXME: Should also attempt to build VAddr and Offset like the non-split - // case (see call site for this function) - - // Create a vector of result registers - SmallVector<unsigned, 8> ResultRegs; - for (unsigned i = 0; i < Count ; ++i) { - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); - MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg) - .addReg(Offset.getReg()) // offset - .addReg(Rsrc.getReg()) // rsrc - .addImm(0) // soffset - .addImm(i << 4) // inst_offset - 
.addImm(Glc.getImm()) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addMemOperand(*Inst.memoperands_begin()); - // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE - auto &NewDestOp = NewMI.getOperand(0); - for (unsigned i = 0 ; i < 4 ; i++) - ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass, - RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass)); - } - // Create a new combined result to replace original with - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); - MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL, - get(TargetOpcode::REG_SEQUENCE), FullDestReg); - - for (unsigned i = 0 ; i < Count * 4 ; ++i) { - CombinedResBuilder - .addReg(ResultRegs[i]) - .addImm(RI.getSubRegFromChannel(i)); - } - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); -} - void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, |

