Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 275
1 file changed, 268 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 24769a89b9b..1d64bf892ed 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -99,6 +99,8 @@ enum InstClassEnum {
   BUFFER_LOAD,
   BUFFER_STORE,
   MIMG,
+  TBUFFER_LOAD,
+  TBUFFER_STORE,
 };
 
 enum RegisterEnum {
@@ -119,6 +121,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     unsigned Offset1;
     unsigned Width0;
     unsigned Width1;
+    unsigned Format0;
+    unsigned Format1;
     unsigned BaseOff;
     unsigned DMask0;
     unsigned DMask1;
@@ -206,12 +210,14 @@ private:
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
+  const MCSubtargetInfo *STI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
   bool OptimizeAgain;
 
-  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
-  static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool dmasksCanBeCombined(const CombineInfo &CI,
+                                  const SIInstrInfo &TII);
+  static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
   static unsigned getNewOpcode(const CombineInfo &CI);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
@@ -230,6 +236,8 @@ private:
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                            int32_t NewOffset) const;
@@ -285,6 +293,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
     return countPopulation(DMaskImm);
   }
+  if (TII.isMTBUF(Opc)) {
+    return AMDGPU::getMTBUFElements(Opc);
+  }
 
   switch (Opc) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -323,10 +334,27 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
         return UNKNOWN;
       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
-      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
+      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
+          TII.isGather4(Opc))
         return UNKNOWN;
       return MIMG;
     }
+    if (TII.isMTBUF(Opc)) {
+      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
+      default:
+        return UNKNOWN;
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
+        return TBUFFER_LOAD;
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
+        return TBUFFER_STORE;
+      }
+    }
     return UNKNOWN;
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
@@ -357,6 +385,8 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
       assert(Info);
       return Info->BaseOpcode;
     }
+    if (TII.isMTBUF(Opc))
+      return AMDGPU::getMTBUFBaseOpcode(Opc);
     return -1;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -398,6 +428,24 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
       result |= SSAMP;
+
+    return result;
+  }
+  if (TII.isMTBUF(Opc)) {
+    unsigned result = 0;
+
+    if (AMDGPU::getMTBUFHasVAddr(Opc)) {
+      result |= VADDR;
+    }
+
+    if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
+      result |= SRSRC;
+    }
+
+    if (AMDGPU::getMTBUFHasSoffset(Opc)) {
+      result |= SOFFSET;
+    }
+
     return result;
   }
 
@@ -420,7 +468,6 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
   }
 }
 
-
 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                               const SIInstrInfo &TII,
                                               const GCNSubtarget &STM) {
@@ -457,6 +504,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     Offset0 = I->getOperand(OffsetIdx).getImm();
   }
 
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+    Format0 = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+
   Width0 = getOpcodeWidth(*I, TII);
 
   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
@@ -518,6 +568,9 @@ void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI
     Offset1 = Paired->getOperand(OffsetIdx).getImm();
   }
 
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+    Format1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::format)->getImm();
+
   Width1 = getOpcodeWidth(*Paired, TII);
   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
     Offset1 &= 0xffff;
@@ -530,7 +583,6 @@ void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI
   }
 }
 
-
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -671,7 +723,33 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIIn
   return true;
 }
 
-bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
+static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
+                                             unsigned ComponentCount,
+                                             const MCSubtargetInfo &STI) {
+  if (ComponentCount > 4)
+    return 0;
+
+  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
+      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
+  if (!OldFormatInfo)
+    return 0;
+
+  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
+      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
+                                           ComponentCount,
+                                           OldFormatInfo->NumFormat, STI);
+
+  if (!NewFormatInfo)
+    return 0;
+
+  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
+         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
+
+  return NewFormatInfo->Format;
+}
+
+bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
+                                                const MCSubtargetInfo &STI) {
   assert(CI.InstClass != MIMG);
 
   // XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -683,6 +761,30 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     return false;
 
+  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
+
+    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
+        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format0, STI);
+    if (!Info0)
+      return false;
+    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
+        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format1, STI);
+    if (!Info1)
+      return false;
+
+    if (Info0->BitsPerComp != Info1->BitsPerComp ||
+        Info0->NumFormat != Info1->NumFormat)
+      return false;
+
+    // TODO: Should be possible to support more formats, but if format loads
+    // are not dword-aligned, the merged load might not be valid.
+    if (Info0->BitsPerComp != 32)
+      return false;
+
+    if (getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, STI) == 0)
+      return false;
+  }
+
   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
   CI.UseST64 = false;
@@ -814,6 +916,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       if (MBBI->hasOrderedMemoryRef())
         return false;
 
+      int Swizzled =
+          AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
+      if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
+        return false;
+
       // Handle a case like
       // DS_WRITE_B32 addr, v, idx0
       // w = DS_READ_B32 addr, idx0
@@ -834,7 +941,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 
       bool canBeCombined = CI.InstClass == MIMG
                                ? dmasksCanBeCombined(CI, *TII)
-                               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
+                               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI, *STI);
 
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
@@ -1201,6 +1308,136 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   return New;
 }
 
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  const unsigned Opcode = getNewOpcode(CI);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+  // Copy to the new source register.
+  Register DestReg = MRI->createVirtualRegister(SuperRC);
+  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+
+  const unsigned Regs = getRegs(Opcode, *TII);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+  unsigned JoinedFormat =
+      getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+
+  // It shouldn't be possible to get this far if the two instructions
+  // don't have a single memoperand, because MachineInstr::mayAlias()
+  // will return true if this is the case.
+  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(MergedOffset) // offset
+          .addImm(JoinedFormat) // format
+          .addImm(CI.GLC0)      // glc
+          .addImm(CI.SLC0)      // slc
+          .addImm(0)            // tfe
+          .addImm(CI.DLC0)      // dlc
+          .addImm(0)            // swz
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+      .add(*Dest0) // Copy to same destination including flags and sub reg.
+      .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+                            .add(*Dest1)
+                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  moveInstsAfter(Copy1, CI.InstsToMove);
+
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return New;
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  const unsigned Opcode = getNewOpcode(CI);
+
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+  // Copy to the new source register.
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+  Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+      .add(*Src0)
+      .addImm(SubRegIdx0)
+      .add(*Src1)
+      .addImm(SubRegIdx1);
+
+  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+                 .addReg(SrcReg, RegState::Kill);
+
+  const unsigned Regs = getRegs(Opcode, *TII);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+  unsigned JoinedFormat =
+      getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+
+  // It shouldn't be possible to get this far if the two instructions
+  // don't have a single memoperand, because MachineInstr::mayAlias()
+  // will return true if this is the case.
+  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+          .addImm(JoinedFormat)                     // format
+          .addImm(CI.GLC0)                          // glc
+          .addImm(CI.SLC0)                          // slc
+          .addImm(0)                                // tfe
+          .addImm(CI.DLC0)                          // dlc
+          .addImm(0)                                // swz
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+  moveInstsAfter(MIB, CI.InstsToMove);
+
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return New;
+}
+
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
   const unsigned Width = CI.Width0 + CI.Width1;
 
@@ -1210,6 +1447,11 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
     // FIXME: Handle d16 correctly
     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                   Width);
+  case TBUFFER_LOAD:
+  case TBUFFER_STORE:
+    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
+                                  Width);
+
   case UNKNOWN:
     llvm_unreachable("Unknown instruction class");
   case S_BUFFER_LOAD_IMM:
@@ -1819,6 +2061,24 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
      break;
+    case TBUFFER_LOAD:
+      if (findMatchingInst(CI)) {
+        Modified = true;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+      }
+      break;
+    case TBUFFER_STORE:
+      if (findMatchingInst(CI)) {
+        Modified = true;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+      }
+      break;
     }
     // Clear the InstsToMove after we have finished searching so we don't have
     // stale values left over if we search for this CI again in another pass
@@ -1839,6 +2099,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
   TII = STM->getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  STI = &MF.getSubtarget<MCSubtargetInfo>();
   MRI = &MF.getRegInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
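For readers skimming the patch, the eligibility rule that offsetsCanBeCombined now applies to tbuffer accesses can be summarized outside of LLVM. The following is a minimal, self-contained C++ sketch of that rule only; the FormatInfo struct and mergedFormat helper are illustrative stand-ins and are not part of the patch (the real code queries llvm::AMDGPU::getGcnBufferFormatInfo and the new getBufferFormatWithCompCount helper).

// Illustrative sketch only: mirrors the format-compatibility check performed
// before two tbuffer accesses are merged, using a stand-in struct instead of
// llvm::AMDGPU::GcnBufferFormatInfo.
#include <optional>

struct FormatInfo {
  unsigned BitsPerComp;   // bits per component (e.g. 32)
  unsigned NumComponents; // 1..4 components (X, XY, XYZ, XYZW)
  unsigned NumFormat;     // numeric format (float, sint, uint, ...)
};

// Two accesses merge only if they agree on component size and numeric format,
// each component is a full dword, and the combined component count still maps
// to a legal buffer format (i.e. does not exceed 4).
std::optional<FormatInfo> mergedFormat(const FormatInfo &A, const FormatInfo &B) {
  if (A.BitsPerComp != B.BitsPerComp || A.NumFormat != B.NumFormat)
    return std::nullopt; // mismatched formats cannot be combined
  if (A.BitsPerComp != 32)
    return std::nullopt; // only dword-sized components are handled
  unsigned Count = A.NumComponents + B.NumComponents;
  if (Count > 4)
    return std::nullopt; // no buffer format holds more than 4 components
  return FormatInfo{A.BitsPerComp, Count, A.NumFormat};
}

If a merged format exists, the pass rewrites the pair into a single wider TBUFFER instruction (for example, two adjacent single-component loads become one two-component load) and copies the sub-registers back to the original destinations, as shown in mergeTBufferLoadPair above.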