diff options
| author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-11-12 18:48:17 +0000 |
|---|---|---|
| committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-11-12 18:48:17 +0000 |
| commit | e86c8d33b1d3eacaa616e3d8a6c99638cbe3f9e3 (patch) | |
| tree | d2eebd4b195751f663b78768206a66df0bfd8c41 /llvm/lib/Target | |
| parent | 8512e5909e1e5bd996e88e96df9ed572b6016f2e (diff) | |
| download | bcm5719-llvm-e86c8d33b1d3eacaa616e3d8a6c99638cbe3f9e3.tar.gz bcm5719-llvm-e86c8d33b1d3eacaa616e3d8a6c99638cbe3f9e3.zip | |
[AMDGPU] Optimize S_CBRANCH_VCC[N]Z -> S_CBRANCH_EXEC[N]Z
Sometimes after basic block placement we end up with code like:
sreg = s_mov_b64 -1
vcc = s_and_b64 exec, sreg
s_cbranch_vccz
This happens as a join of a block assigning -1 to a saved mask and
another block which consumes that saved mask with s_and_b64 and a
branch.
This is essentially a single s_cbranch_execz instruction when moved
into a single new basic block.
Differential Revision: https://reviews.llvm.org/D54164
llvm-svn: 346690
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 97 |
1 file changed, 97 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index dc9397cf7b8..f23fa02bf8a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -66,6 +66,8 @@ private: bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); + bool optimizeVccBranch(MachineInstr &MI) const; + public: static char ID; @@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, return true; } +bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { + // Match: + // sreg = -1 + // vcc = S_AND_B64 exec, sreg + // S_CBRANCH_VCC[N]Z + // => + // S_CBRANCH_EXEC[N]Z + bool Changed = false; + MachineBasicBlock &MBB = *MI.getParent(); + const unsigned CondReg = AMDGPU::VCC; + const unsigned ExecReg = AMDGPU::EXEC; + const unsigned And = AMDGPU::S_AND_B64; + + MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), + E = MBB.rend(); + bool ReadsCond = false; + unsigned Threshold = 5; + for (++A ; A != E ; ++A) { + if (!--Threshold) + return false; + if (A->modifiesRegister(ExecReg, TRI)) + return false; + if (A->modifiesRegister(CondReg, TRI)) { + if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) + return false; + break; + } + ReadsCond |= A->readsRegister(CondReg, TRI); + } + if (A == E) + return false; + + MachineOperand &Op1 = A->getOperand(1); + MachineOperand &Op2 = A->getOperand(2); + if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + TII->commuteInstruction(*A); + Changed = true; + } + if (Op1.getReg() != ExecReg) + return Changed; + if (Op2.isImm() && Op2.getImm() != -1) + return Changed; + + unsigned SReg = AMDGPU::NoRegister; + if (Op2.isReg()) { + SReg = Op2.getReg(); + auto M = std::next(A); + bool ReadsSreg = false; + for ( ; M != E ; ++M) { + if (M->definesRegister(SReg, TRI)) + break; + if (M->modifiesRegister(SReg, TRI)) + return Changed; + ReadsSreg |= M->readsRegister(SReg, TRI); + } + if (M == E || + !M->isMoveImmediate() || + !M->getOperand(1).isImm() || + M->getOperand(1).getImm() != -1) + return Changed; + // First if sreg is only used in and instruction fold the immediate + // into that and. + if (!ReadsSreg && Op2.isKill()) { + A->getOperand(2).ChangeToImmediate(-1); + M->eraseFromParent(); + } + } + + if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && + MI.killsRegister(CondReg, TRI)) + A->eraseFromParent(); + + bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; + if (SReg == ExecReg) { + if (IsVCCZ) { + MI.eraseFromParent(); + return true; + } + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else { + MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ + : AMDGPU::S_CBRANCH_EXECNZ)); + } + + MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.addImplicitDefUseOperands(*MBB.getParent()); + + return true; +} + bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { } break; + case AMDGPU::S_CBRANCH_VCCZ: + case AMDGPU::S_CBRANCH_VCCNZ: + MadeChange |= optimizeVccBranch(MI); + break; + default: break; } |

