author     Matt Arsenault <Matthew.Arsenault@amd.com>   2016-09-29 01:44:16 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2016-09-29 01:44:16 +0000
commit     e6740754f083f7d85e5a8afcc959c8d5b13b32c6 (patch)
tree       6c1aff82a41ecb07f3ace675912f966aa4a68fd3 /llvm/lib/Target
parent     ae689e3498d9ad5fe9aa7d759a2f8cf9f017563c (diff)
AMDGPU: Partially fix control flow at -O0
Fixes to allow spilling all registers at the end of the block to
work with exec modifications. Don't emit s_and_saveexec_b64 for
if lowering, and instead emit copies. Mark control flow mask
instructions as terminators to get correct spill code placement
with fast regalloc, and then have a separate optimization pass
form the saveexec.
This should work if SGPRs are spilled to VGPRs, but
will likely fail in the case that an SGPR spills to memory
and no workitem takes a divergent branch.
llvm-svn: 282667
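Schematically, the new lowering for SI_IF (this mirrors the header comment of the
SIOptimizeExecMasking.cpp file added below; the register names here are placeholders,
not taken from real compiler output) emits separate operations so spill code for the
saved exec copy can be placed between them, and the post-RA pass folds them back
together when nothing intervenes:

    x = copy exec
    z = s_and_b64 x, y      ; y holds the branch condition
    exec = copy z           ; emitted as S_MOV_B64_term so it stays a block terminator
    =>
    x = s_and_saveexec_b64 y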
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                  |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |  21
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt            |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           |  19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td         |  25
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp    |  74
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 304
7 files changed, 426 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b75a6c03b83..2610dfe07da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -73,6 +73,9 @@ extern char &SILowerControlFlowID;
 void initializeSIInsertSkipsPass(PassRegistry &);
 extern char &SIInsertSkipsPassID;
 
+void initializeSIOptimizeExecMaskingPass(PassRegistry &);
+extern char &SIOptimizeExecMaskingID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2a9f76212b9..aa2ebbe2852 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -83,6 +83,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
+  initializeSIOptimizeExecMaskingPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -333,6 +334,7 @@ public:
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
   void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
 };
@@ -548,7 +550,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  addPass(createSIShrinkInstructionsPass());
   addPass(createSIWholeQuadModePass());
 }
 
@@ -556,7 +557,11 @@ void GCNPassConfig::addPreRegAlloc() {
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
   // FIXME: We have to disable the verifier here because of PHIElimination +
   // TwoAddressInstructions disabling it.
-  insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);
+
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
 
@@ -566,13 +571,19 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   // passes might recompute live intervals.
   insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
 
-  // TODO: It might be better to run this right after phi elimination, but for
-  // now that would require not running the verifier.
-  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
 
+void GCNPassConfig::addPostRegAlloc() {
+  addPass(&SIOptimizeExecMaskingID);
+  TargetPassConfig::addPostRegAlloc();
+}
+
 void GCNPassConfig::addPreSched2() {
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e58e5b2f92d..ae8862fd1bb 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -77,6 +77,7 @@ add_llvm_target(AMDGPUCodeGen
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
+  SIOptimizeExecMasking.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5426f7f3ced..0fed33e3214 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -856,7 +856,24 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
-
+  case AMDGPU::S_MOV_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_MOV_B64));
+    break;
+  }
+  case AMDGPU::S_XOR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_XOR_B64));
+    break;
+  }
+  case AMDGPU::S_ANDN2_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
+    break;
+  }
   case AMDGPU::V_MOV_B64_PSEUDO: {
     unsigned Dst = MI.getOperand(0).getReg();
     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 60c65e9675d..e4114adf599 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -112,6 +112,27 @@ def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
 } // End let usesCustomInserter = 1, SALU = 1
 
+def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
+def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
+def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
@@ -132,9 +153,9 @@ def SI_IF: CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
   [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
   let Constraints = "";
-  let Size = 8;
-  let mayStore = 1;
+  let Size = 12;
   let mayLoad = 1;
+  let mayStore = 1;
   let hasSideEffects = 1;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a4355982d27..d81a650a291 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -70,6 +70,7 @@ private:
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
   LiveIntervals *LIS;
+  MachineRegisterInfo *MRI;
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
@@ -86,7 +87,8 @@ public:
     MachineFunctionPass(ID),
     TRI(nullptr),
     TII(nullptr),
-    LIS(nullptr) {}
+    LIS(nullptr),
+    MRI(nullptr) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -95,8 +97,12 @@ public:
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addPreserved<LiveIntervals>();
+    // Should preserve the same set that TwoAddressInstructions does.
     AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -109,6 +115,13 @@ char SILowerControlFlow::ID = 0;
 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                 "SI lower control flow", false, false)
 
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+  MachineOperand &ImpDefSCC = MI.getOperand(3);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+
+  ImpDefSCC.setIsDead(IsDead);
+}
+
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
 void SILowerControlFlow::emitIf(MachineInstr &MI) {
@@ -123,14 +136,36 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   unsigned SaveExecReg = SaveExec.getReg();
 
-  MachineInstr *AndSaveExec =
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
-    .addOperand(Cond);
+  MachineOperand &ImpDefSCC = MI.getOperand(4);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+
+  // Add an implicit def of exec to discourage scheduling VALU after this which
+  // will interfere with trying to form s_and_saveexec_b64 later.
+  MachineInstr *CopyExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SaveExecReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+  unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  MachineInstr *And =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+    .addReg(SaveExecReg)
+    //.addReg(AMDGPU::EXEC)
+    .addReg(Cond.getReg());
+  setImpSCCDefDead(*And, true);
 
   MachineInstr *Xor =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
-    .addReg(AMDGPU::EXEC)
+    .addReg(Tmp)
     .addReg(SaveExecReg);
+  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+  // Use a copy that is a terminator to get correct spill code placement it with
+  // fast regalloc.
+  MachineInstr *SetExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+    .addReg(Tmp, RegState::Kill);
 
   // Insert a pseudo terminator to help keep the verifier happy. This will also
   // be used later when inserting skips.
@@ -143,11 +178,17 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
     return;
   }
 
+  LIS->InsertMachineInstrInMaps(*CopyExec);
+
+  // Replace with and so we don't need to fix the live interval for condition
+  // register.
+  LIS->ReplaceMachineInstrInMaps(MI, *And);
-  LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
   LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*SetExec);
   LIS->InsertMachineInstrInMaps(*NewBr);
 
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
   MI.eraseFromParent();
 
   // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
   // valno.
   LIS->removeInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(Tmp);
 }
 
 void SILowerControlFlow::emitElse(MachineInstr &MI) {
@@ -167,11 +209,18 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   bool ExecModified = MI.getOperand(3).getImm() != 0;
   MachineBasicBlock::iterator Start = MBB.begin();
 
+  // We are running before TwoAddressInstructions, and si_else's operands are
+  // tied. In order to correctly tie the registers, split this into a copy of
+  // the src like it does.
+  BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), DstReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
+
   // This must be inserted before phis and any spill code inserted before the
   // else.
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
-    .addOperand(MI.getOperand(1)); // Saved EXEC
+    .addReg(DstReg);
+
   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
   MachineBasicBlock::iterator ElsePt(MI);
@@ -187,14 +236,12 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   }
 
   MachineInstr *Xor =
-    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
     .addReg(DstReg);
 
-  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
-
   // Insert a pseudo terminator to help keep the verifier happy.
   MachineInstr *Branch =
-    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
     .addMBB(DestBB);
 
   if (!LIS) {
@@ -246,7 +293,7 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   const DebugLoc &DL = MI.getDebugLoc();
 
   MachineInstr *AndN2 =
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
     .addOperand(MI.getOperand(0));
 
@@ -288,6 +335,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   // This doesn't actually need LiveIntervals, but we can preserve them.
   LIS = getAnalysisIfAvailable<LiveIntervals>();
+  MRI = &MF.getRegInfo();
 
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
new file mode 100644
index 00000000000..b8994f6f949
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -0,0 +1,304 @@
+//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking"
+
+namespace {
+
+class SIOptimizeExecMasking : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
+    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI optimize exec mask operations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
+                      "SI optimize exec mask operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
+                    "SI optimize exec mask operations", false, false)
+
+char SIOptimizeExecMasking::ID = 0;
+
+char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
+
+/// If \p MI is a copy from exec, return the register copied to.
+static unsigned isCopyFromExec(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_term: {
+    const MachineOperand &Src = MI.getOperand(1);
+    if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(0).getReg();
+  }
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+/// If \p MI is a copy to exec, return the register copied from.
+static unsigned isCopyToExec(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B64: {
+    const MachineOperand &Dst = MI.getOperand(0);
+    if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(1).getReg();
+    break;
+  }
+  case AMDGPU::S_MOV_B64_term:
+    llvm_unreachable("should have been replaced");
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+static unsigned getSaveExecOp(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_AND_B64:
+    return AMDGPU::S_AND_SAVEEXEC_B64;
+  case AMDGPU::S_OR_B64:
+    return AMDGPU::S_OR_SAVEEXEC_B64;
+  case AMDGPU::S_XOR_B64:
+    return AMDGPU::S_XOR_SAVEEXEC_B64;
+  case AMDGPU::S_ANDN2_B64:
+    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
+  case AMDGPU::S_ORN2_B64:
+    return AMDGPU::S_ORN2_SAVEEXEC_B64;
+  case AMDGPU::S_NAND_B64:
+    return AMDGPU::S_NAND_SAVEEXEC_B64;
+  case AMDGPU::S_NOR_B64:
+    return AMDGPU::S_NOR_SAVEEXEC_B64;
+  case AMDGPU::S_XNOR_B64:
+    return AMDGPU::S_XNOR_SAVEEXEC_B64;
+  default:
+    return AMDGPU::INSTRUCTION_LIST_END;
+  }
+}
+
+// These are only terminators to get correct spill code placement during
+// register allocation, so turn them back into normal instructions. Only one of
+// these is expected per block.
+static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_MOV_B64_term: {
+    MI.setDesc(TII.get(AMDGPU::COPY));
+    return true;
+  }
+  case AMDGPU::S_XOR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+    return true;
+  }
+  case AMDGPU::S_ANDN2_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+static MachineBasicBlock::reverse_iterator fixTerminators(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB) {
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+  for (; I != E; ++I) {
+    if (!I->isTerminator())
+      return I;
+
+    if (removeTerminatorBit(TII, *I))
+      return I;
+  }
+
+  return E;
+}
+
+static MachineBasicBlock::reverse_iterator findExecCopy(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB,
+  MachineBasicBlock::reverse_iterator I,
+  unsigned CopyToExec) {
+  const unsigned InstLimit = 25;
+
+  auto E = MBB.rend();
+  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
+    unsigned CopyFromExec = isCopyFromExec(*I);
+    if (CopyFromExec != AMDGPU::NoRegister)
+      return I;
+  }
+
+  return E;
+}
+
+// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
+// report the register as unavailable because a super-register with a lane mask
+// as unavailable.
+static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
+  for (MachineBasicBlock *Succ : MBB.successors()) {
+    if (Succ->isLiveIn(Reg))
+      return true;
+  }
+
+  return false;
+}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Optimize sequences emitted for control flow lowering. They are originally
+  // emitted as the separate operations because spill code may need to be
+  // inserted for the saved copy of exec.
+  //
+  //     x = copy exec
+  //     z = s_<op>_b64 x, y
+  //     exec = copy z
+  // =>
+  //     x = s_<op>_saveexec_b64 y
+  //
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+    MachineBasicBlock::reverse_iterator E = MBB.rend();
+    if (I == E)
+      continue;
+
+    unsigned CopyToExec = isCopyToExec(*I);
+    if (CopyToExec == AMDGPU::NoRegister)
+      continue;
+
+    // Scan backwards to find the def.
+    auto CopyToExecInst = &*I;
+    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+    if (CopyFromExecInst == E)
+      continue;
+
+    if (isLiveOut(MBB, CopyToExec)) {
+      // The copied register is live out and has a second use in another block.
+      DEBUG(dbgs() << "Exec copy source register is live out\n");
+      continue;
+    }
+
+    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+    MachineInstr *SaveExecInst = nullptr;
+    SmallVector<MachineInstr *, 4> OtherUseInsts;
+
+    for (MachineBasicBlock::iterator J
+           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+         J != JE; ++J) {
+      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+        // Make sure this is inserted after any VALU ops that may have been
+        // scheduled in between.
+        SaveExecInst = nullptr;
+        break;
+      }
+
+      if (J->modifiesRegister(CopyToExec, TRI)) {
+        if (SaveExecInst) {
+          DEBUG(dbgs() << "Multiple instructions modify "
+                << PrintReg(CopyToExec, TRI) << '\n');
+          SaveExecInst = nullptr;
+          break;
+        }
+
+        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
+        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
+          break;
+
+        if (J->readsRegister(CopyFromExec, TRI)) {
+          SaveExecInst = &*J;
+          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+        } else {
+          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+          break;
+        }
+      }
+
+      if (SaveExecInst && J->readsRegister(CopyToExec, TRI))
+        OtherUseInsts.push_back(&*J);
+    }
+
+    if (!SaveExecInst)
+      continue;
+
+    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+
+    MachineOperand &Src0 = SaveExecInst->getOperand(1);
+    MachineOperand &Src1 = SaveExecInst->getOperand(2);
+
+    MachineOperand *CopyOp = nullptr;
+    MachineOperand *OtherOp = nullptr;
+
+    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
+      CopyOp = &Src0;
+      OtherOp = &Src1;
+    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
+      if (!SaveExecInst->isCommutable())
+        break;
+
+      CopyOp = &Src1;
+      OtherOp = &Src0;
+    } else
+      llvm_unreachable("unexpected");
+
+    CopyFromExecInst->eraseFromParent();
+
+    auto InsPt = SaveExecInst->getIterator();
+    const DebugLoc &DL = SaveExecInst->getDebugLoc();
+
+    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
+            CopyFromExec)
+      .addReg(OtherOp->getReg());
+    SaveExecInst->eraseFromParent();
+
+    CopyToExecInst->eraseFromParent();
+
+    for (MachineInstr *OtherInst : OtherUseInsts) {
+      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+                                    AMDGPU::NoSubRegister, *TRI);
+    }
+  }
+
+  return true;
+}