diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 417 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 221 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 21 |
10 files changed, 278 insertions, 434 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 3cb0419dcca..59dfacb8c9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -51,7 +51,7 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); -FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); FunctionPass *createAMDGPUUseNativeCallsPass(); @@ -148,8 +148,8 @@ extern char &SIInsertSkipsPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; -void initializeSIFixWWMLivenessPass(PassRegistry &); -extern char &SIFixWWMLivenessID; +void initializeSIPreAllocateWWMRegsPass(PassRegistry &); +extern char &SIPreAllocateWWMRegsID; void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); extern char &AMDGPUSimplifyLibCallsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 66f19b1c1d2..84fa5918447 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -208,7 +208,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); - initializeSIFixWWMLivenessPass(*PR); + initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); @@ -879,9 +879,8 @@ void GCNPassConfig::addFastRegAlloc() { // SI_ELSE will introduce a copy of the tied operand source after the else. 
insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); TargetPassConfig::addFastRegAlloc(); } @@ -899,9 +898,8 @@ void GCNPassConfig::addOptimizedRegAlloc() { // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); TargetPassConfig::addOptimizedRegAlloc(); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 109d95cec2c..019bf85e5d8 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -95,7 +95,7 @@ add_llvm_target(AMDGPUCodeGen SIFixSGPRCopies.cpp SIFixupVectorISel.cpp SIFixVGPRCopies.cpp - SIFixWWMLiveness.cpp + SIPreAllocateWWMRegs.cpp SIFoldOperands.cpp SIFormMemoryClauses.cpp SIFrameLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp deleted file mode 100644 index 01c76327aab..00000000000 --- a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ /dev/null @@ -1,417 +0,0 @@ -//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Computations in WWM can overwrite values in inactive channels for -/// variables that the register allocator thinks are dead. This pass adds fake -/// uses of those variables to their def(s) to make sure that they aren't -/// overwritten. -/// -/// As an example, consider this snippet: -/// %vgpr0 = V_MOV_B32_e32 0.0 -/// if (...) { -/// %vgpr1 = ... -/// %vgpr2 = WWM killed %vgpr1 -/// ... = killed %vgpr2 -/// %vgpr0 = V_MOV_B32_e32 1.0 -/// } -/// ... = %vgpr0 -/// -/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally, -/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since -/// writing %vgpr1 would only write to channels that would be clobbered by the -/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled, -/// it would clobber even the inactive channels for which the if-condition is -/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use -/// of %vgpr0 to its def to make sure they aren't allocated to the -/// same register. -/// -/// In general, we need to figure out what registers might have their inactive -/// channels which are eventually used accidentally clobbered by a WWM -/// instruction. We do that by spotting three separate cases of registers: -/// -/// 1. A "then phi": the value resulting from phi elimination of a phi node at -/// the end of an if..endif. If there is WWM code in the "then", then we -/// make the def at the end of the "then" branch a partial def by adding an -/// implicit use of the register. -/// -/// 2. A "loop exit register": a value written inside a loop but used outside the -/// loop, where there is WWM code inside the loop (the case in the example -/// above). 
We add an implicit_def of the register in the loop pre-header, -/// and make the original def a partial def by adding an implicit use of the -/// register. -/// -/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node -/// in a loop header. If there is WWM code inside the loop, then we make all -/// defs inside the loop partial defs by adding an implicit use of the -/// register on each one. -/// -/// Note that we do not need to consider an if..else..endif phi. We only need to -/// consider non-uniform control flow, and control flow structurization would -/// have transformed a non-uniform if..else..endif into two if..endifs. -/// -/// The analysis to detect these cases relies on a property of the MIR -/// arising from this pass running straight after PHIElimination and before any -/// coalescing: that any virtual register with more than one definition must be -/// the new register added to lower a phi node by PHIElimination. -/// -/// FIXME: We should detect whether a register in one of the above categories is -/// already live at the WWM code before deciding to add the implicit uses to -/// synthesize its liveness. -/// -/// FIXME: I believe this whole scheme may be flawed due to the possibility of -/// the register allocator doing live interval splitting. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SparseBitVector.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-wwm-liveness" - -namespace { - -class SIFixWWMLiveness : public MachineFunctionPass { -private: - MachineDominatorTree *DomTree; - MachineLoopInfo *LoopInfo; - LiveIntervals *LIS = nullptr; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - - std::vector<MachineInstr *> WWMs; - std::vector<MachineOperand *> ThenDefs; - std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs; - std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs; - -public: - static char ID; - - SIFixWWMLiveness() : MachineFunctionPass(ID) { - initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fix WWM Liveness"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredID(MachineDominatorsID); - AU.addRequiredID(MachineLoopInfoID); - // Should preserve the same set that TwoAddressInstructions does. 
- AU.addPreserved<SlotIndexes>(); - AU.addPreserved<LiveIntervals>(); - AU.addPreservedID(LiveVariablesID); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - void processDef(MachineOperand &DefOpnd); - bool processThenDef(MachineOperand *DefOpnd); - bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop); - bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop); -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE, - "SI fix WWM liveness", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE, - "SI fix WWM liveness", false, false) - -char SIFixWWMLiveness::ID = 0; - -char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID; - -FunctionPass *llvm::createSIFixWWMLivenessPass() { - return new SIFixWWMLiveness(); -} - -bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n"); - bool Modified = false; - - // This doesn't actually need LiveIntervals, but we can preserve them. - LIS = getAnalysisIfAvailable<LiveIntervals>(); - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); - - DomTree = &getAnalysis<MachineDominatorTree>(); - LoopInfo = &getAnalysis<MachineLoopInfo>(); - - // Scan the function to find the WWM sections and the candidate registers for - // having liveness modified. 
- for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::EXIT_WWM) - WWMs.push_back(&MI); - else { - for (MachineOperand &DefOpnd : MI.defs()) { - if (DefOpnd.isReg()) { - unsigned Reg = DefOpnd.getReg(); - if (TRI->isVGPR(*MRI, Reg)) - processDef(DefOpnd); - } - } - } - } - } - if (!WWMs.empty()) { - // Synthesize liveness over WWM sections as required. - for (auto ThenDef : ThenDefs) - Modified |= processThenDef(ThenDef); - for (auto LoopExitDef : LoopExitDefs) - Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second); - for (auto LoopPhiDef : LoopPhiDefs) - Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second); - } - - WWMs.clear(); - ThenDefs.clear(); - LoopExitDefs.clear(); - LoopPhiDefs.clear(); - - return Modified; -} - -// During the function scan, process an operand that defines a VGPR. -// This categorizes the register and puts it in the appropriate list for later -// use when processing a WWM section. -void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) { - unsigned Reg = DefOpnd.getReg(); - // Get all the defining instructions. For convenience, make Defs[0] the def - // we are on now. - SmallVector<const MachineInstr *, 4> Defs; - Defs.push_back(DefOpnd.getParent()); - for (auto &MI : MRI->def_instructions(Reg)) { - if (&MI != DefOpnd.getParent()) - Defs.push_back(&MI); - } - // Check whether this def dominates all the others. If not, ignore this def. - // Either it is going to be processed when the scan encounters its other def - // that dominates all defs, or there is no def that dominates all others. - // The latter case is an eliminated phi from an if..else..endif or similar, - // which must be for uniform control flow so can be ignored. - // Because this pass runs shortly after PHIElimination, we assume that any - // multi-def register is a lowered phi, and thus has each def in a separate - // basic block. 
- for (unsigned I = 1; I != Defs.size(); ++I) { - if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent())) - return; - } - // Check for the case of an if..endif lowered phi: It has two defs, one - // dominates the other, and there is a single use in a successor of the - // dominant def. - // Later we will spot any WWM code inside - // the "then" clause and turn the second def into a partial def so its - // liveness goes through the WWM code in the "then" clause. - if (Defs.size() == 2) { - auto DomDefBlock = Defs[0]->getParent(); - if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) { - auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); - for (auto Succ : DomDefBlock->successors()) { - if (Succ == UseBlock) { - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n"); - ThenDefs.push_back(&DefOpnd); - return; - } - } - } - } - // Check for the case of a non-lowered-phi register (single def) that exits - // a loop, that is, it has a use that is outside a loop that the def is - // inside. We find the outermost loop that the def is inside but a use is - // outside. Later we will spot any WWM code inside that loop and then make - // the def a partial def so its liveness goes round the loop and through the - // WWM code. - if (Defs.size() == 1) { - auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent()); - if (!Loop) - return; - bool IsLoopExit = false; - for (auto &Use : MRI->use_instructions(Reg)) { - auto UseBlock = Use.getParent(); - if (Loop->contains(UseBlock)) - continue; - IsLoopExit = true; - while (auto Parent = Loop->getParentLoop()) { - if (Parent->contains(UseBlock)) - break; - Loop = Parent; - } - } - if (!IsLoopExit) - return; - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is a loop exit reg with loop header at " - << "bb." 
<< Loop->getHeader()->getNumber() << "\n"); - LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>( - &DefOpnd, Loop)); - return; - } - // Check for the case of a lowered single-preheader-loop phi, that is, a - // multi-def register where the dominating def is in the loop pre-header and - // all other defs are in backedges. Later we will spot any WWM code inside - // that loop and then make the backedge defs partial defs so the liveness - // goes through the WWM code. - // Note that we are ignoring multi-preheader loops on the basis that the - // structurizer does not allow that for non-uniform loops. - // There must be a single use in the loop header. - if (!MRI->hasOneUse(Reg)) - return; - auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); - auto Loop = LoopInfo->getLoopFor(UseBlock); - if (!Loop || Loop->getHeader() != UseBlock - || Loop->contains(Defs[0]->getParent())) { - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is multi-def but single use not in loop header\n"); - return; - } - for (unsigned I = 1; I != Defs.size(); ++I) { - if (!Loop->contains(Defs[I]->getParent())) - return; - } - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) - << " is a loop phi reg with loop header at " - << "bb." << Loop->getHeader()->getNumber() << "\n"); - LoopPhiDefs.push_back( - std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop)); -} - -// Process a then phi def: It has two defs, one dominates the other, and there -// is a single use in a successor of the dominant def. Here we spot any WWM -// code inside the "then" clause and turn the second def into a partial def so -// its liveness goes through the WWM code in the "then" clause. -bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) { - LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent()); - if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) { - // Ignore if dominating def is undef. 
- LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n"); - return false; - } - unsigned Reg = DefOpnd->getReg(); - // Get the use block, which is the endif block. - auto UseBlock = MRI->use_instr_begin(Reg)->getParent(); - // Check whether there is WWM code inside the then branch. The WWM code must - // be dominated by the if but not dominated by the endif. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent()) - && !DomTree->dominates(UseBlock, WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - // Get the other def. - MachineInstr *OtherDef = nullptr; - for (auto &MI : MRI->def_instructions(Reg)) { - if (&MI != DefOpnd->getParent()) - OtherDef = &MI; - } - // Make it a partial def. - OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *OtherDef); - return true; -} - -// Process a loop exit def, that is, a register with a single use in a loop -// that has a use outside the loop. Here we spot any WWM code inside that loop -// and then make the def a partial def so its liveness goes round the loop and -// through the WWM code. -bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd, - MachineLoop *Loop) { - LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent()); - // Check whether there is WWM code inside the loop. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (Loop->contains(WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - unsigned Reg = DefOpnd->getReg(); - // Add a new implicit_def in loop preheader(s). 
- for (auto Pred : Loop->getHeader()->predecessors()) { - if (!Loop->contains(Pred)) { - auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(), - TII->get(TargetOpcode::IMPLICIT_DEF), Reg); - LLVM_DEBUG(dbgs() << *ImplicitDef); - (void)ImplicitDef; - } - } - // Make the original def partial. - DefOpnd->getParent()->addOperand(MachineOperand::CreateReg( - Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *DefOpnd->getParent()); - return true; -} - -// Process a loop phi def, that is, a multi-def register where the dominating -// def is in the loop pre-header and all other defs are in backedges. Here we -// spot any WWM code inside that loop and then make the backedge defs partial -// defs so the liveness goes through the WWM code. -bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd, - MachineLoop *Loop) { - LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent()); - // Check whether there is WWM code inside the loop. - bool ContainsWWM = false; - for (auto WWM : WWMs) { - if (Loop->contains(WWM->getParent())) { - LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); - ContainsWWM = true; - break; - } - } - if (!ContainsWWM) - return false; - unsigned Reg = DefOpnd->getReg(); - // Remove kill mark from uses. - for (auto &Use : MRI->use_operands(Reg)) - Use.setIsKill(false); - // Make all defs except the dominating one partial defs. 
- SmallVector<MachineInstr *, 4> Defs; - for (auto &Def : MRI->def_instructions(Reg)) - Defs.push_back(&Def); - for (auto Def : Defs) { - if (DefOpnd->getParent() == Def) - continue; - Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - LLVM_DEBUG(dbgs() << *Def); - } - return true; -} - diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 370b8cf2cfb..1196fe1512c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -457,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); - const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); + + const unsigned Reg = FirstDst->getReg(); + + const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } @@ -1322,9 +1327,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::ENTER_WWM: { + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is entered. + MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64)); + break; + } case AMDGPU::EXIT_WWM: { - // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM - // is exited. + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is exited. 
MI.setDesc(get(AMDGPU::S_MOV_B64)); break; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index bad6c3d5656..f6978034d25 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -121,6 +121,13 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] +def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> { + let Defs = [EXEC]; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> { let hasSideEffects = 0; let mayLoad = 0; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 29e2460ae79..7e09f41aa8d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -22,6 +22,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -259,6 +260,10 @@ public: SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {} }; + SparseBitVector<> WWMReservedRegs; + + void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } + private: // SGPR->VGPR spilling support. using SpillRegMask = std::pair<unsigned, unsigned>; diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp new file mode 100644 index 00000000000..f9bfe96f65c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -0,0 +1,221 @@ +//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to pre-allocate WWM registers +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-pre-allocate-wwm-regs" + +namespace { + +class SIPreAllocateWWMRegs : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + LiveRegMatrix *Matrix; + VirtRegMap *VRM; + RegisterClassInfo RegClassInfo; + + std::vector<unsigned> RegsToRewrite; + +public: + static char ID; + + SIPreAllocateWWMRegs() : MachineFunctionPass(ID) { + initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<SlotIndexes>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool processDef(MachineOperand &MO); + void rewriteRegs(MachineFunction &MF); +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) + +char SIPreAllocateWWMRegs::ID = 0; + +char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID; + +FunctionPass *llvm::createSIPreAllocateWWMRegsPass() { + return new SIPreAllocateWWMRegs(); +} + +bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVGPR(*MRI, Reg)) + return false; + + if (TRI->isPhysicalRegister(Reg)) + return false; + + if (VRM->hasPhys(Reg)) + return false; + + LiveInterval &LI = LIS->getInterval(Reg); + + for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) { + if (!MRI->isPhysRegUsed(PhysReg) && + Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) { + Matrix->assign(LI, PhysReg); + assert(PhysReg != 0); + RegsToRewrite.push_back(Reg); + return true; + } + } + + llvm_unreachable("physreg not found for WWM expression"); + return false; +} + +void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + const unsigned VirtReg = MO.getReg(); + if (TRI->isPhysicalRegister(VirtReg)) + continue; + + if (!VRM->hasPhys(VirtReg)) + continue; + + unsigned PhysReg = VRM->getPhys(VirtReg); + const unsigned SubReg = MO.getSubReg(); + if (SubReg != 0) { + PhysReg = TRI->getSubReg(PhysReg, SubReg); + MO.setSubReg(0); + } + + MO.setReg(PhysReg); + MO.setIsRenamable(false); + } + } + } + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + for (unsigned Reg : RegsToRewrite) { + LIS->removeInterval(Reg); + + const unsigned PhysReg = 
VRM->getPhys(Reg); + assert(PhysReg != 0); + MFI->ReserveWWMRegister(PhysReg); + } + + RegsToRewrite.clear(); + + // Update the set of reserved registers to include WWM ones. + MRI->freezeReservedRegs(MF); +} + +bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n"); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis<LiveIntervals>(); + Matrix = &getAnalysis<LiveRegMatrix>(); + VRM = &getAnalysis<VirtRegMap>(); + + RegClassInfo.runOnMachineFunction(MF); + + bool RegsAssigned = false; + + // We use a reverse post-order traversal of the control-flow graph to + // guarantee that we visit definitions in dominance order. Since WWM + // expressions are guaranteed to never involve phi nodes, and we can only + // escape WWM through the special WWM instruction, this means that this is a + // perfect elimination order, so we can never do any better. 
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); + + for (MachineBasicBlock *MBB : RPOT) { + bool InWWM = false; + for (MachineInstr &MI : *MBB) { + if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) + RegsAssigned |= processDef(MI.getOperand(0)); + + if (MI.getOpcode() == AMDGPU::ENTER_WWM) { + LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); + InWWM = true; + continue; + } + + if (MI.getOpcode() == AMDGPU::EXIT_WWM) { + LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); + InWWM = false; + } + + if (!InWWM) + continue; + + LLVM_DEBUG(dbgs() << "processing " << MI << "\n"); + + for (MachineOperand &DefOpnd : MI.defs()) { + RegsAssigned |= processDef(DefOpnd); + } + } + } + + if (!RegsAssigned) + return false; + + rewriteRegs(MF); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8c8caaa9bc7..e3538ae6074 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -230,6 +230,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } + for (unsigned Reg : MFI->WWMReservedRegs) { + reserveRegisterTuples(Reserved, Reg); + } + return Reserved; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 03c0353390f..b7d96f0842f 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -656,8 +656,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MachineInstr *MI; assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), - SaveOrig) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); } @@ -839,7 +838,23 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) 
{ for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) MI->RemoveOperand(i); - MI->setDesc(TII->get(AMDGPU::COPY)); + + const unsigned Reg = MI->getOperand(0).getReg(); + + if (TRI->isVGPR(*MRI, Reg)) { + const TargetRegisterClass *regClass = + TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI->getRegClass(Reg) + : TRI->getPhysRegClass(Reg); + + const unsigned MovOp = TII->getMovOpcode(regClass); + MI->setDesc(TII->get(MovOp)); + + // And make it implicitly depend on exec (like all VALU movs should do). + MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + } else { + MI->setDesc(TII->get(AMDGPU::COPY)); + } } } |