author | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:55 +0000 |
---|---|---|
committer | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:55 +0000 |
commit | 58410f37ff58c5778d349725458011a57ee21bf9 (patch) | |
tree | 05f88334afc264f9fe1845fdcf63162f7ba20ac2 /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | |
parent | 5cec64195ceea262f86ae3d305607eb4e7840d88 (diff) | |
download | bcm5719-llvm-58410f37ff58c5778d349725458011a57ee21bf9.tar.gz bcm5719-llvm-58410f37ff58c5778d349725458011a57ee21bf9.zip |
AMDGPU: Merge BUFFER_STORE_DWORD_OFFEN/OFFSET into x2, x4
Summary:
Only 56 shaders (out of 48486) are affected.
Totals from affected shaders (changed stats only):
SGPRS: 2420 -> 2460 (1.65 %)
Spilled VGPRs: 94 -> 112 (19.15 %)
Scratch size: 524 -> 528 (0.76 %) dwords per thread
Code Size: 187400 -> 184992 (-1.28 %) bytes
One DiRT Showdown shader spills 6 more VGPRs.
One Grid Autosport shader spills 12 more VGPRs.
The other 54 shaders only have a decrease in code size.
(I'm ignoring the SGPR noise)
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D39012
llvm-svn: 317755
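What the pass now does for stores is easy to state: two BUFFER_STORE_DWORD (or BUFFER_STORE_DWORDX2) instructions writing consecutive dword offsets through the same resource descriptor are rewritten as one x2 (or x4) store. The adjacency rule lives in offsetsCanBeCombined, shown in the diff below. Here is a minimal standalone sketch of that rule; the names are mine, offsets are taken in bytes, and the real check additionally requires matching glc/slc cache bits:

```cpp
#include <cstdio>

// Sketch of the dword-adjacency test (not LLVM code). Offsets are the byte
// offsets from the instructions' offset fields; buffer ops use EltSize = 4.
// IsX2 means each instruction already stores two dwords, so the gap between
// mergeable offsets doubles.
static bool offsetsAdjacent(unsigned Off0, unsigned Off1, bool IsX2) {
  const unsigned EltSize = 4;        // one dword
  unsigned Elt0 = Off0 / EltSize;
  unsigned Elt1 = Off1 / EltSize;
  unsigned Diff = IsX2 ? 2 : 1;      // store width in dwords
  return Elt0 + Diff == Elt1 || Elt1 + Diff == Elt0;
}

int main() {
  std::printf("%d\n", offsetsAdjacent(16, 20, false)); // 1: x1 pair -> x2
  std::printf("%d\n", offsetsAdjacent(16, 24, true));  // 1: x2 pair -> x4
  std::printf("%d\n", offsetsAdjacent(16, 24, false)); // 0: one-dword gap
  return 0;
}
```

The merged instruction then takes std::min(CI.Offset0, CI.Offset1) as its offset, and because the rule is parameterized on Diff, stores already merged to x2 can be merged again into x4 on a later pass over the block (which is why the merge loop counts CreatedX2).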
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 113 |
1 file changed, 109 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 3c1657bd252..eff74d609f0 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -80,6 +80,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     S_BUFFER_LOAD_IMM,
     BUFFER_LOAD_OFFEN,
     BUFFER_LOAD_OFFSET,
+    BUFFER_STORE_OFFEN,
+    BUFFER_STORE_OFFSET,
   };
 
   struct CombineInfo {
@@ -114,6 +116,9 @@ private:
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
+  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
+                                    bool &IsOffen) const;
+  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
 public:
   static char ID;
@@ -231,10 +236,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   CI.UseST64 = false;
   CI.BaseOff = 0;
 
-  // SMEM offsets must be consecutive.
-  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
-      CI.InstClass == BUFFER_LOAD_OFFEN ||
-      CI.InstClass == BUFFER_LOAD_OFFSET) {
+  // Handle SMEM and VMEM instructions.
+  if (CI.InstClass != DS_READ_WRITE) {
     unsigned Diff = CI.IsX2 ? 2 : 1;
     return (EltOffset0 + Diff == EltOffset1 ||
             EltOffset1 + Diff == EltOffset0) &&
@@ -297,11 +300,13 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
     break;
   case BUFFER_LOAD_OFFEN:
+  case BUFFER_STORE_OFFEN:
     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     break;
   case BUFFER_LOAD_OFFSET:
+  case BUFFER_STORE_OFFSET:
     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
     break;
@@ -680,6 +685,90 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
   return Next;
 }
 
+unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
+  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
+  IsX2 = false;
+  IsOffen = false;
+
+  switch (I.getOpcode()) {
+  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+    IsOffen = true;
+    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
+  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+    IsOffen = true;
+    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
+  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+    IsX2 = true;
+    IsOffen = true;
+    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
+  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
+    IsX2 = true;
+    IsOffen = true;
+    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
+  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
+  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
+    IsX2 = true;
+    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
+    IsX2 = true;
+    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+  }
+  return 0;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+  CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+  bool Unused1, Unused2;
+  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
+
+  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+  // Handle descending offsets
+  if (CI.Offset0 > CI.Offset1)
+    std::swap(SubRegIdx0, SubRegIdx1);
+
+  // Copy to the new source register.
+  const TargetRegisterClass *SuperRC =
+    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+
+  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+      .add(*Src0)
+      .addImm(SubRegIdx0)
+      .add(*Src1)
+      .addImm(SubRegIdx1);
+
+  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+      .addReg(SrcReg, RegState::Kill);
+
+  if (CI.InstClass == BUFFER_STORE_OFFEN)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+     .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+     .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+     .addImm(CI.GLC0)                          // glc
+     .addImm(CI.SLC0)                          // slc
+     .addImm(0)                                // tfe
+     .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+  moveInstsAfter(MIB, CI.InstsToMove);
+
+  MachineBasicBlock::iterator Next = std::next(CI.I);
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return Next;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
@@ -763,6 +852,22 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    bool StoreIsX2, IsOffen;
+    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
+      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+      CI.EltSize = 4;
+      CI.IsX2 = StoreIsX2;
+      if (findMatchingInst(CI)) {
+        Modified = true;
+        I = mergeBufferStorePair(CI);
+        if (!CI.IsX2)
+          CreatedX2++;
+      } else {
+        ++I;
+      }
+      continue;
+    }
+
     ++I;
   }
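One detail of mergeBufferStorePair worth spelling out: the two vdata operands are glued into a 64-bit or 128-bit super-register with REG_SEQUENCE, and when the pair was found with descending offsets, the subregister indices are swapped so the lower-offset data still occupies the lower lanes. A minimal sketch of that index selection (the enum values are illustrative stand-ins, not AMDGPU's real subregister-index encodings):

```cpp
#include <cstdio>
#include <utility>

// Illustrative lane indices: sub0/sub1 for x1 sources, sub0_sub1/sub2_sub3
// for x2 sources being merged into an x4 store.
enum SubRegIdx { sub0, sub1, sub0_sub1, sub2_sub3 };

// Returns the REG_SEQUENCE lanes for (first, paired) store data. The source
// writing the lower offset must land in the lower lanes of the wide register.
static std::pair<SubRegIdx, SubRegIdx>
pickSubRegs(unsigned Offset0, unsigned Offset1, bool IsX2) {
  SubRegIdx Idx0 = IsX2 ? sub0_sub1 : sub0;
  SubRegIdx Idx1 = IsX2 ? sub2_sub3 : sub1;
  if (Offset0 > Offset1)  // descending offsets: swap the lane assignment
    std::swap(Idx0, Idx1);
  return {Idx0, Idx1};
}

int main() {
  // First store at offset 20, paired store at offset 16: the first
  // instruction's data goes to sub1, the paired one's to sub0.
  auto Lanes = pickSubRegs(20, 16, /*IsX2=*/false);
  std::printf("%d %d\n", Lanes.first, Lanes.second); // prints "1 0"
  return 0;
}
```

Marking the REG_SEQUENCE result with RegState::Kill on the merged store tells the allocator the wide register dies there, so the merge does not extend the combined value's live range past the store itself.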