diff options
author | Neil Henning <neil.henning@amd.com> | 2018-12-12 16:15:21 +0000 |
---|---|---|
committer | Neil Henning <neil.henning@amd.com> | 2018-12-12 16:15:21 +0000 |
commit | 76504a4c5e196aac50afe65f1db55345b9a01b7e (patch) | |
tree | bacefc350bd264ec596c3b27c90fe518e97d8d38 /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | |
parent | ef8683abec6b4f36cef4bba2fd6a4b69f0e59f22 (diff) | |
download | bcm5719-llvm-76504a4c5e196aac50afe65f1db55345b9a01b7e.tar.gz bcm5719-llvm-76504a4c5e196aac50afe65f1db55345b9a01b7e.zip |
[AMDGPU] Extend the SI Load/Store optimizer to combine more things.
I've extended the load/store optimizer to be able to produce dwordx3
loads and stores. This change allows many more loads/stores to be combined,
and results in much more optimal code for our hardware.
Differential Revision: https://reviews.llvm.org/D54042
llvm-svn: 348937
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 680 |
1 files changed, 442 insertions, 238 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index e379e98623a..c7bec721166 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -43,9 +43,9 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -74,23 +74,38 @@ using namespace llvm; #define DEBUG_TYPE "si-load-store-opt" namespace { +enum InstClassEnum { + UNKNOWN, + DS_READ, + DS_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, + BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, + BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, + BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, + BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, + BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, +}; -class SILoadStoreOptimizer : public MachineFunctionPass { - enum InstClassEnum { - DS_READ_WRITE, - S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN, - BUFFER_LOAD_OFFSET, - BUFFER_STORE_OFFEN, - BUFFER_STORE_OFFSET, - }; +enum RegisterEnum { + SBASE = 0x1, + SRSRC = 0x2, + SOFFSET = 0x4, + VADDR = 0x8, + ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; + unsigned Width0; + unsigned Width1; unsigned BaseOff; InstClassEnum InstClass; bool GLC0; @@ -98,9 +113,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool SLC0; bool SLC1; bool UseST64; - bool IsX2; - 
SmallVector<MachineInstr*, 8> InstsToMove; - }; + SmallVector<MachineInstr *, 8> InstsToMove; + }; private: const GCNSubtarget *STM = nullptr; @@ -108,9 +122,16 @@ private: const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; - unsigned CreatedX2; + bool OptimizeAgain; static bool offsetsCanBeCombined(CombineInfo &CI); + static bool widthsFit(const CombineInfo &CI); + static unsigned getNewOpcode(const CombineInfo &CI); + static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); + const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); + unsigned getOpcodeWidth(const MachineInstr &MI); + InstClassEnum getInstClass(unsigned Opc); + unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -123,8 +144,6 @@ private: MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, - bool &IsOffen) const; MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); public: @@ -153,8 +172,8 @@ public: INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", + false, false) char SILoadStoreOptimizer::ID = 0; @@ -165,7 +184,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { } static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr*> InstsToMove) { + ArrayRef<MachineInstr *> InstsToMove) { MachineBasicBlock *MBB = I->getParent(); ++I; for (MachineInstr *MI : InstsToMove) { @@ -191,21 +210,19 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool 
memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, - AliasAnalysis * AA) { + AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. -static bool -addToListsIfDependent(MachineInstr &MI, - DenseSet<unsigned> &RegDefs, - DenseSet<unsigned> &PhysRegUses, - SmallVectorImpl<MachineInstr*> &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses, + SmallVectorImpl<MachineInstr *> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move @@ -228,18 +245,16 @@ addToListsIfDependent(MachineInstr &MI, return false; } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr*> InstsToMove, - const SIInstrInfo *TII, - AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef<MachineInstr *> InstsToMove, + const SIInstrInfo *TII, AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) - return false; + return false; } return true; } @@ -260,10 +275,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { CI.BaseOff = 0; // Handle SMEM and VMEM instructions. - if (CI.InstClass != DS_READ_WRITE) { - unsigned Diff = CI.IsX2 ? 
2 : 1; - return (EltOffset0 + Diff == EltOffset1 || - EltOffset1 + Diff == EltOffset0) && + if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { + return (EltOffset0 + CI.Width0 == EltOffset1 || + EltOffset1 + CI.Width1 == EltOffset0) && CI.GLC0 == CI.GLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -305,42 +319,175 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { return false; } +bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) { + const unsigned Width = (CI.Width0 + CI.Width1); + switch (CI.InstClass) { + default: + return Width <= 4; + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return false; + case 2: + case 4: + return true; + } + } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { + const unsigned Opc = MI.getOpcode(); + + if (TII->isMUBUF(MI)) { + return AMDGPU::getMUBUFDwords(Opc); + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); + + // If we couldn't identify the opcode, bail out. 
+ if (baseOpcode == -1) { + return UNKNOWN; + } + + switch (baseOpcode) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + return BUFFER_LOAD_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return BUFFER_LOAD_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + return BUFFER_STORE_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return BUFFER_STORE_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + return BUFFER_LOAD_OFFEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + return BUFFER_STORE_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE_OFFSET_exact; + } + } + + switch (Opc) { + default: + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + bool 
SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; - unsigned AddrOpName[3] = {0}; - int AddrIdx[3]; - const MachineOperand *AddrReg[3]; + const unsigned Opc = CI.I->getOpcode(); + const InstClassEnum InstClass = getInstClass(Opc); + + if (InstClass == UNKNOWN) { + return false; + } + + const unsigned Regs = getRegs(Opc); + + unsigned AddrOpName[5] = {0}; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; unsigned NumAddresses = 0; - switch (CI.InstClass) { - case DS_READ_WRITE: + if (Regs & ADDR) { AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - break; - case S_BUFFER_LOAD_IMM: + } + + if (Regs & SBASE) { AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - break; - case BUFFER_LOAD_OFFEN: - case BUFFER_STORE_OFFEN: - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; - case BUFFER_LOAD_OFFSET: - case BUFFER_STORE_OFFSET: + } + + if (Regs & SRSRC) { AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; } for (unsigned i = 0; i < NumAddresses; i++) { AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - // We only ever merge operations with the same base address register, so don't - // bother scanning forward if there are no other uses. + // We only ever merge operations with the same base address register, so + // don't bother scanning forward if there are no other uses. 
if (AddrReg[i]->isReg() && (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +500,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { DenseSet<unsigned> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - for ( ; MBBI != E; ++MBBI) { - if (MBBI->getOpcode() != CI.I->getOpcode()) { + for (; MBBI != E; ++MBBI) { + const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + + if ((getInstClass(MBBI->getOpcode()) != InstClass) || + (IsDS && (MBBI->getOpcode() != Opc))) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. @@ -368,8 +518,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +563,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { continue; } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. 
if (AddrReg[i]->getReg() != AddrRegNext.getReg() || AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { Match = false; @@ -423,13 +573,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (Match) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::offset); + int OffsetIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Width0 = getOpcodeWidth(*CI.I); CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); + CI.Width1 = getOpcodeWidth(*MBBI); CI.Paired = MBBI; - if (CI.InstClass == DS_READ_WRITE) { + if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { CI.Offset0 &= 0xffff; CI.Offset1 &= 0xffff; } else { @@ -445,7 +597,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(CI)) + if (widthsFit(CI) && offsetsCanBeCombined(CI)) if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) return true; } @@ -472,12 +624,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32_gfx9 + : AMDGPU::DS_READ2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +641,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,13 +654,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC - = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = + (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); @@ -519,23 +670,24 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -562,32 +714,36 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 + : AMDGPU::DS_WRITE2_B64_gfx9; } unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) - return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; - return (EltSize == 4) ? 
- AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 + : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + const MachineOperand *AddrReg = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
@@ -596,8 +752,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); @@ -608,25 +763,26 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); @@ -638,15 +794,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : - AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -656,12 +811,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( .addImm(CI.GLC0) // glc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -683,29 +835,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode; - if (CI.InstClass == BUFFER_LOAD_OFFEN) { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - } else { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; - } + const unsigned Opcode = getNewOpcode(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + // Copy to the new source register. 
unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - if (CI.InstClass == BUFFER_LOAD_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -715,12 +863,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // tfe .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. 
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -742,57 +887,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( return Next; } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( - const MachineInstr &I, bool &IsX2, bool &IsOffen) const { - IsX2 = false; - IsOffen = false; - - switch (I.getOpcode()) { - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; - } - return 0; +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { + const unsigned Width = CI.Width0 + CI.Width1; + + switch (CI.InstClass) { + default: + return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + case UNKNOWN: + llvm_unreachable("Unknown instruction class"); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + } + } +} + +std::pair<unsigned, unsigned> +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { + if (CI.Offset0 > CI.Offset1) { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + 
return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); + case 3: + return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); + } + } + } else { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); + case 2: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); + case 3: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); + case 2: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); + } + } + } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { + if (CI.InstClass == S_BUFFER_LOAD_IMM) { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::SReg_64_XEXECRegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: + return &AMDGPU::SReg_512RegClass; + } + } else { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + } + } } -MachineBasicBlock::iterator 
SILoadStoreOptimizer::mergeBufferStorePair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - bool Unused1, Unused2; - unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + const unsigned Opcode = getNewOpcode(CI); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the new source register. - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); unsigned SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -805,10 +1030,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(SubRegIdx1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) - .addReg(SrcReg, RegState::Kill); + .addReg(SrcReg, RegState::Kill); - if (CI.InstClass == BUFFER_STORE_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -841,90 +1068,69 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } + const unsigned Opc = MI.getOpcode(); + CombineInfo CI; CI.I = I; - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || - Opc == 
AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = getInstClass(Opc); - CI.InstClass = DS_READ_WRITE; + switch (CI.InstClass) { + default: + break; + case DS_READ: CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; - + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { ++I; } - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || - Opc == AMDGPU::DS_WRITE_B32_gfx9 || - Opc == AMDGPU::DS_WRITE_B64_gfx9) { - CI.InstClass = DS_READ_WRITE; - CI.EltSize - = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - + case DS_WRITE: + CI.EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { ++I; } - continue; - } - if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { - // EltSize is in units of the offset encoding. 
- CI.InstClass = S_BUFFER_LOAD_IMM; + case S_BUFFER_LOAD_IMM: CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); - CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; if (findMatchingInst(CI)) { Modified = true; I = mergeSBufferLoadImmPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; } else { ++I; } continue; - } - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) - CI.InstClass = BUFFER_LOAD_OFFEN; - else - CI.InstClass = BUFFER_LOAD_OFFSET; - + case BUFFER_LOAD_OFFEN: + case BUFFER_LOAD_OFFSET: + case BUFFER_LOAD_OFFEN_exact: + case BUFFER_LOAD_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferLoadPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } continue; - } - - bool StoreIsX2, IsOffen; - if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { - CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + case BUFFER_STORE_OFFEN: + case BUFFER_STORE_OFFSET: + case BUFFER_STORE_OFFEN_exact: + case BUFFER_STORE_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = StoreIsX2; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferStorePair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } @@ -958,12 +1164,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; for (MachineBasicBlock &MBB : MF) { - CreatedX2 = 0; - Modified |= optimizeBlock(MBB); - - // Run again to convert x2 to x4. - if (CreatedX2 >= 1) + do { + OptimizeAgain = false; Modified |= optimizeBlock(MBB); + } while (OptimizeAgain); } return Modified; |