| author | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-11-07 21:53:43 +0000 |
|---|---|---|
| committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-11-07 21:53:43 +0000 |
| commit | bc233f5523f43de34788179a11e390aaa69c8209 | |
| tree | 8521369469d3c3e2a52afe36233671a4cfb8d1c8 /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | |
| parent | 61396ff67c05863e63264431d209c6a91ff7dc53 | |
Revert "AMDGPU: Divergence-driven selection of scalar buffer load intrinsics"
This reverts commit r344696 for now (except for some test additions).
See https://bugs.freedesktop.org/show_bug.cgi?id=108611.
llvm-svn: 346364
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 185 |

1 file changed, 178 insertions(+), 7 deletions(-)
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4dd06df1233..562428ef37c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3558,13 +3558,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-    SBase->setReg(SGPR);
-  }
-  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
-  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
-    SOff->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
   }
 }
 
@@ -4193,6 +4188,115 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
+
+    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+      unsigned VDst;
+      unsigned NewOpcode;
+
+      switch(Opcode) {
+      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+        splitScalarBuffer(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;
+      }
+
+      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+      unsigned Offset = 0;
+
+      // FIXME: This isn't safe because the addressing mode doesn't work
+      // correctly if vaddr is negative.
+      //
+      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+      //
+      // See if we can extract an immediate offset by recognizing one of these:
+      //   V_ADD_I32_e32 dst, imm, src1
+      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+      // V_ADD will be removed by "Remove dead machine instructions".
+      if (Add &&
+          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+        static const unsigned SrcNames[2] = {
+          AMDGPU::OpName::src0,
+          AMDGPU::OpName::src1,
+        };
+
+        // Find a literal offset in one of source operands.
+        for (int i = 0; i < 2; i++) {
+          const MachineOperand *Src = getNamedOperand(*Add, SrcNames[i]);
+
+          if (Src->isReg()) {
+            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
+            if (Def) {
+              if (Def->isMoveImmediate())
+                Src = &Def->getOperand(1);
+              else if (Def->isCopy()) {
+                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+                if (Mov && Mov->isMoveImmediate()) {
+                  Src = &Mov->getOperand(1);
+                }
+              }
+            }
+          }
+
+          if (Src) {
+            if (Src->isImm())
+              Offset = Src->getImm();
+            else if (Src->isCImm())
+              Offset = Src->getCImm()->getZExtValue();
+          }
+
+          if (Offset && isLegalMUBUFImmOffset(Offset)) {
+            VAddr = getNamedOperand(*Add, SrcNames[!i]);
+            break;
+          }
+
+          Offset = 0;
+        }
+      }
+
+      MachineInstr *NewInstr =
+          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+                  get(NewOpcode), VDst)
+        .add(*VAddr) // vaddr
+        .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+        .addImm(0) // soffset
+        .addImm(Offset) // offset
+        .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .cloneMemRefs(Inst)
+        .getInstr();
+
+      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+                         VDst);
+      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+      Inst.eraseFromParent();
+
+      // Legalize all operands other than the offset. Notably, convert the srsrc
+      // into SGPRs using v_readfirstlane if needed.
+      legalizeOperands(*NewInstr, MDT);
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4674,6 +4778,73 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+                                    MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  auto &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);
+  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+  unsigned Opcode = Inst.getOpcode();
+  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+  unsigned Count = 0;
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+  switch(Opcode) {
+  default:
+    return;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    Count = 2;
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+    Count = 4;
+    break;
+  }
+
+  // FIXME: Should also attempt to build VAddr and Offset like the non-split
+  // case (see call site for this function)
+
+  // Create a vector of result registers
+  SmallVector<unsigned, 8> ResultRegs;
+  for (unsigned i = 0; i < Count ; ++i) {
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+      .addReg(Offset.getReg()) // offset
+      .addReg(Rsrc.getReg()) // rsrc
+      .addImm(0) // soffset
+      .addImm(i << 4) // inst_offset
+      .addImm(Glc.getImm()) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addMemOperand(*Inst.memoperands_begin());
+    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
+    auto &NewDestOp = NewMI.getOperand(0);
+    for (unsigned i = 0 ; i < 4 ; i++)
+      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+                                              RI.getSubRegFromChannel(i),
+                                              &AMDGPU::VGPR_32RegClass));
+  }
+  // Create a new combined result to replace original with
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+    CombinedResBuilder
+      .addReg(ResultRegs[i])
+      .addImm(RI.getSubRegFromChannel(i));
+  }
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
     unsigned DstReg, MachineRegisterInfo &MRI,
```

