author    | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:23 +0000
committer | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:23 +0000
commit    | b953cc36e2b97726a98af9a57e9ae1dfb3a747fc (patch)
tree      | f360588065fdfdaeeb242b7c4dd71559dce19663 /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent    | ffadcb744bc3d8ce0ca516bab09445643c1061a5 (diff)
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
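For example (this is the same transformation shown in the comment this patch adds to SILoadStoreOptimizer.cpp):
  s_buffer_load_dword s4, s[0:3], 4
  s_buffer_load_dword s5, s[0:3], 8
==>
  s_buffer_load_dwordx2 s[4:5], s[0:3], 4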
The results are mixed; I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is a 13.4% increase in SGPR spilling and DiRT Showdown spills a few more
VGPRs (now 37), but also a 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot of SGPRs,
so it's uncertain whether the additional spilling will make any difference,
since SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS      Shaders  SpillSGPR  AvgPerSh
alien_isolation            2938        100       0.0
batman_arkham_origins       589          6       0.0
bioshock-infinite          1769          4       0.0
borderlands2               3968         22       0.0
counter_strike_glob..      1142         60       0.1
deus_ex_mankind_div..      1410         79       0.1
dirt-showdown               533          4       0.0
dirt_rally                  364       1163       3.2
divinity                   1052          2       0.0
dota2                      1747          7       0.0
f1-2015                     776       1515       2.0
grid_autosport             1767       1505       0.9
hitman                     1413        273       0.2
left_4_dead_2              1762          4       0.0
life_is_strange            1296         26       0.0
mad_max                     358         96       0.3
metro_2033_redux           2670         60       0.0
payday2                    1362         22       0.0
portal                      474          3       0.0
saints_row_iv              1704          8       0.0
serious_sam_3_bfe           392       1348       3.4
shadow_of_mordor           1418         12       0.0
shadow_warrior             3956        239       0.1
talos_principle             324       1735       5.4
thea                        172         17       0.1
tomb_raider                1449        215       0.1
total_war_warhammer         242         56       0.2
ue4_effects_cave            295         55       0.2
ue4_elemental               572         12       0.0
unigine_tropics             210         56       0.3
unigine_valley              278        152       0.5
victor_vran                1262         84       0.1
yofrankie                    82          2       0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 131
1 file changed, 117 insertions, 14 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 026fd974324..712fd687a71 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -14,6 +14,12 @@
 // ==>
 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
 //
+// The same is done for certain SMEM opcodes, e.g.:
+//  s_buffer_load_dword s4, s[0:3], 4
+//  s_buffer_load_dword s5, s[0:3], 8
+// ==>
+//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
 //
 // Future improvements:
 //
@@ -76,23 +82,28 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     unsigned Offset0;
     unsigned Offset1;
     unsigned BaseOff;
+    bool GLC0;
+    bool GLC1;
     bool UseST64;
+    bool IsSBufferLoadImm;
+    bool IsX2;
     SmallVector<MachineInstr*, 8> InstsToMove;
   };
 
 private:
+  const SISubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
+  unsigned CreatedX2;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
 
-  bool findMatchingDSInst(CombineInfo &CI);
-
+  bool findMatchingInst(CombineInfo &CI);
   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
-
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
 
 public:
   static char ID;
@@ -210,6 +221,14 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   CI.UseST64 = false;
   CI.BaseOff = 0;
 
+  // SMEM offsets must be consecutive.
+  if (CI.IsSBufferLoadImm) {
+    unsigned Diff = CI.IsX2 ? 2 : 1;
+    return (EltOffset0 + Diff == EltOffset1 ||
+            EltOffset1 + Diff == EltOffset0) &&
+           CI.GLC0 == CI.GLC1;
+  }
+
   // If the offset in elements doesn't fit in 8-bits, we might be able to use
   // the stride 64 versions.
   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -247,13 +266,18 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   return false;
 }
 
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
   MachineBasicBlock::iterator E = MBB->end();
   MachineBasicBlock::iterator MBBI = CI.I;
 
-  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
-                                           AMDGPU::OpName::addr);
+  unsigned AddrOpName;
+  if (CI.IsSBufferLoadImm)
+    AddrOpName = AMDGPU::OpName::sbase;
+  else
+    AddrOpName = AMDGPU::OpName::addr;
+
+  int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
   const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
 
   // We only ever merge operations with the same base address register, so don't
@@ -319,10 +343,18 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
         AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
       int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                  AMDGPU::OpName::offset);
-      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
-      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
       CI.Paired = MBBI;
 
+      if (CI.IsSBufferLoadImm) {
+        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+      } else {
+        CI.Offset0 &= 0xffff;
+        CI.Offset1 &= 0xffff;
+      }
+
       // Check both offsets fit in the reduced range.
       // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
@@ -488,6 +520,51 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   return Next;
 }
 
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+  CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+  const TargetRegisterClass *SuperRC =
+    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+      .addImm(MergedOffset) // offset
+      .addImm(CI.GLC0)      // glc
+      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+  // Handle descending offsets
+  if (CI.Offset0 > CI.Offset1)
+    std::swap(SubRegIdx0, SubRegIdx1);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+    .add(*Dest0) // Copy to same destination including flags and sub reg.
+    .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+    .add(*Dest1)
+    .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  moveInstsAfter(Copy1, CI.InstsToMove);
+
+  MachineBasicBlock::iterator Next = std::next(CI.I);
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return Next;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
@@ -505,10 +582,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
 
     CombineInfo CI;
     CI.I = I;
+    CI.IsSBufferLoadImm = false;
     unsigned Opc = MI.getOpcode();
     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
       CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      if (findMatchingDSInst(CI)) {
+      if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeRead2Pair(CI);
       } else {
@@ -516,9 +594,10 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       }
       continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+    }
+    if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
       CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      if (findMatchingDSInst(CI)) {
+      if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeWrite2Pair(CI);
       } else {
         ++I;
@@ -527,6 +606,23 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
+    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+      // EltSize is in units of the offset encoding.
+      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+      CI.IsSBufferLoadImm = true;
+      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+      if (findMatchingInst(CI)) {
+        Modified = true;
+        I = mergeSBufferLoadImmPair(CI);
+        if (!CI.IsX2)
+          CreatedX2++;
+      } else {
+        ++I;
+      }
+      continue;
+    }
 
     ++I;
   }
 
@@ -538,11 +634,11 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
-  if (!STM.loadStoreOptEnabled())
+  STM = &MF.getSubtarget<SISubtarget>();
+  if (!STM->loadStoreOptEnabled())
     return false;
 
-  TII = STM.getInstrInfo();
+  TII = STM->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
 
@@ -553,9 +649,16 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
   bool Modified = false;
+  CreatedX2 = 0;
 
   for (MachineBasicBlock &MBB : MF)
     Modified |= optimizeBlock(MBB);
 
+  // Run again to convert x2 to x4.
+  if (CreatedX2 >= 1) {
+    for (MachineBasicBlock &MBB : MF)
+      Modified |= optimizeBlock(MBB);
+  }
+
   return Modified;
 }
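As a reading aid, here is a minimal standalone C++ sketch of the SMEM pairing rule this patch adds to offsetsCanBeCombined(). The helper name smemOffsetsCanBeCombined and its plain unsigned/bool parameters are invented for illustration and are not LLVM API; the rule itself is taken from the diff above: two S_BUFFER_LOAD_*_IMM loads are only paired when their glc bits match and their offsets, in units of the offset encoding, are adjacent (1 unit apart for dword loads, 2 units apart for dwordx2 loads).

#include <cstdio>

// Hypothetical mirror of the SMEM branch of offsetsCanBeCombined().
// EltOffset0/EltOffset1 are offsets already divided by the element size,
// IsX2 is true when both inputs are dwordx2 loads, GLC0/GLC1 are the glc bits.
static bool smemOffsetsCanBeCombined(unsigned EltOffset0, unsigned EltOffset1,
                                     bool IsX2, bool GLC0, bool GLC1) {
  unsigned Diff = IsX2 ? 2 : 1; // an x2 load covers two dwords
  return (EltOffset0 + Diff == EltOffset1 ||
          EltOffset1 + Diff == EltOffset0) &&
         GLC0 == GLC1; // cache policy must match
}

int main() {
  // dword loads at element offsets 1 and 2 (bytes 4 and 8): mergeable into x2.
  std::printf("dword pair:   %d\n", smemOffsetsCanBeCombined(1, 2, false, false, false));
  // dwordx2 loads two elements apart: mergeable into x4 on a later pass.
  std::printf("dwordx2 pair: %d\n", smemOffsetsCanBeCombined(2, 4, true, false, false));
  // Mismatched glc bits: never merged.
  std::printf("glc mismatch: %d\n", smemOffsetsCanBeCombined(1, 2, false, true, false));
  return 0;
}

The x2 case is also why runOnMachineFunction() now counts CreatedX2 and, when it is non-zero, runs optimizeBlock() over the function a second time: a dword pair only becomes a dwordx2 load on the first pass, so pairing two such results into a dwordx4 requires another pass.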