summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt2
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp417
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td7
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h5
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp221
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp21
10 files changed, 278 insertions, 434 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 3cb0419dcca..59dfacb8c9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -51,7 +51,7 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -148,8 +148,8 @@ extern char &SIInsertSkipsPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
extern char &AMDGPUSimplifyLibCallsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 66f19b1c1d2..84fa5918447 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -208,7 +208,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
- initializeSIFixWWMLivenessPass(*PR);
+ initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
@@ -879,9 +879,8 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
TargetPassConfig::addFastRegAlloc();
}
@@ -899,9 +898,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
TargetPassConfig::addOptimizedRegAlloc();
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 109d95cec2c..019bf85e5d8 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -95,7 +95,7 @@ add_llvm_target(AMDGPUCodeGen
SIFixSGPRCopies.cpp
SIFixupVectorISel.cpp
SIFixVGPRCopies.cpp
- SIFixWWMLiveness.cpp
+ SIPreAllocateWWMRegs.cpp
SIFoldOperands.cpp
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
deleted file mode 100644
index 01c76327aab..00000000000
--- a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-/// %vgpr0 = V_MOV_B32_e32 0.0
-/// if (...) {
-/// %vgpr1 = ...
-/// %vgpr2 = WWM killed %vgpr1
-/// ... = killed %vgpr2
-/// %vgpr0 = V_MOV_B32_e32 1.0
-/// }
-/// ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-/// the end of an if..endif. If there is WWM code in the "then", then we
-/// make the def at the end of the "then" branch a partial def by adding an
-/// implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside the
-/// loop, where there is WWM code inside the loop (the case in the example
-/// above). We add an implicit_def of the register in the loop pre-header,
-/// and make the original def a partial def by adding an implicit use of the
-/// register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
-/// in a loop header. If there is WWM code inside the loop, then we make all
-/// defs inside the loop partial defs by adding an implicit use of the
-/// register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need to
-/// consider non-uniform control flow, and control flow structurization would
-/// have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before any
-/// coalescing: that any virtual register with more than one definition must be
-/// the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories is
-/// already live at the WWM code before deciding to add the implicit uses to
-/// synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
- MachineDominatorTree *DomTree;
- MachineLoopInfo *LoopInfo;
- LiveIntervals *LIS = nullptr;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
-
- std::vector<MachineInstr *> WWMs;
- std::vector<MachineOperand *> ThenDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
- static char ID;
-
- SIFixWWMLiveness() : MachineFunctionPass(ID) {
- initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(MachineDominatorsID);
- AU.addRequiredID(MachineLoopInfoID);
- // Should preserve the same set that TwoAddressInstructions does.
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(LiveVariablesID);
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
-private:
- void processDef(MachineOperand &DefOpnd);
- bool processThenDef(MachineOperand *DefOpnd);
- bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
- bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
- return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
- bool Modified = false;
-
- // This doesn't actually need LiveIntervals, but we can preserve them.
- LIS = getAnalysisIfAvailable<LiveIntervals>();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
-
- DomTree = &getAnalysis<MachineDominatorTree>();
- LoopInfo = &getAnalysis<MachineLoopInfo>();
-
- // Scan the function to find the WWM sections and the candidate registers for
- // having liveness modified.
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::EXIT_WWM)
- WWMs.push_back(&MI);
- else {
- for (MachineOperand &DefOpnd : MI.defs()) {
- if (DefOpnd.isReg()) {
- unsigned Reg = DefOpnd.getReg();
- if (TRI->isVGPR(*MRI, Reg))
- processDef(DefOpnd);
- }
- }
- }
- }
- }
- if (!WWMs.empty()) {
- // Synthesize liveness over WWM sections as required.
- for (auto ThenDef : ThenDefs)
- Modified |= processThenDef(ThenDef);
- for (auto LoopExitDef : LoopExitDefs)
- Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
- for (auto LoopPhiDef : LoopPhiDefs)
- Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
- }
-
- WWMs.clear();
- ThenDefs.clear();
- LoopExitDefs.clear();
- LoopPhiDefs.clear();
-
- return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
- unsigned Reg = DefOpnd.getReg();
- // Get all the defining instructions. For convenience, make Defs[0] the def
- // we are on now.
- SmallVector<const MachineInstr *, 4> Defs;
- Defs.push_back(DefOpnd.getParent());
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd.getParent())
- Defs.push_back(&MI);
- }
- // Check whether this def dominates all the others. If not, ignore this def.
- // Either it is going to be processed when the scan encounters its other def
- // that dominates all defs, or there is no def that dominates all others.
- // The latter case is an eliminated phi from an if..else..endif or similar,
- // which must be for uniform control flow so can be ignored.
- // Because this pass runs shortly after PHIElimination, we assume that any
- // multi-def register is a lowered phi, and thus has each def in a separate
- // basic block.
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
- return;
- }
- // Check for the case of an if..endif lowered phi: It has two defs, one
- // dominates the other, and there is a single use in a successor of the
- // dominant def.
- // Later we will spot any WWM code inside
- // the "then" clause and turn the second def into a partial def so its
- // liveness goes through the WWM code in the "then" clause.
- if (Defs.size() == 2) {
- auto DomDefBlock = Defs[0]->getParent();
- if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- for (auto Succ : DomDefBlock->successors()) {
- if (Succ == UseBlock) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
- ThenDefs.push_back(&DefOpnd);
- return;
- }
- }
- }
- }
- // Check for the case of a non-lowered-phi register (single def) that exits
- // a loop, that is, it has a use that is outside a loop that the def is
- // inside. We find the outermost loop that the def is inside but a use is
- // outside. Later we will spot any WWM code inside that loop and then make
- // the def a partial def so its liveness goes round the loop and through the
- // WWM code.
- if (Defs.size() == 1) {
- auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
- if (!Loop)
- return;
- bool IsLoopExit = false;
- for (auto &Use : MRI->use_instructions(Reg)) {
- auto UseBlock = Use.getParent();
- if (Loop->contains(UseBlock))
- continue;
- IsLoopExit = true;
- while (auto Parent = Loop->getParentLoop()) {
- if (Parent->contains(UseBlock))
- break;
- Loop = Parent;
- }
- }
- if (!IsLoopExit)
- return;
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop exit reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
- &DefOpnd, Loop));
- return;
- }
- // Check for the case of a lowered single-preheader-loop phi, that is, a
- // multi-def register where the dominating def is in the loop pre-header and
- // all other defs are in backedges. Later we will spot any WWM code inside
- // that loop and then make the backedge defs partial defs so the liveness
- // goes through the WWM code.
- // Note that we are ignoring multi-preheader loops on the basis that the
- // structurizer does not allow that for non-uniform loops.
- // There must be a single use in the loop header.
- if (!MRI->hasOneUse(Reg))
- return;
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- auto Loop = LoopInfo->getLoopFor(UseBlock);
- if (!Loop || Loop->getHeader() != UseBlock
- || Loop->contains(Defs[0]->getParent())) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is multi-def but single use not in loop header\n");
- return;
- }
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!Loop->contains(Defs[I]->getParent()))
- return;
- }
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop phi reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopPhiDefs.push_back(
- std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
- LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
- if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
- // Ignore if dominating def is undef.
- LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n");
- return false;
- }
- unsigned Reg = DefOpnd->getReg();
- // Get the use block, which is the endif block.
- auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
- // Check whether there is WWM code inside the then branch. The WWM code must
- // be dominated by the if but not dominated by the endif.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
- && !DomTree->dominates(UseBlock, WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- // Get the other def.
- MachineInstr *OtherDef = nullptr;
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd->getParent())
- OtherDef = &MI;
- }
- // Make it a partial def.
- OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *OtherDef);
- return true;
-}
-
-// Process a loop exit def, that is, a register with a single use in a loop
-// that has a use outside the loop. Here we spot any WWM code inside that loop
-// and then make the def a partial def so its liveness goes round the loop and
-// through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Add a new implicit_def in loop preheader(s).
- for (auto Pred : Loop->getHeader()->predecessors()) {
- if (!Loop->contains(Pred)) {
- auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
- LLVM_DEBUG(dbgs() << *ImplicitDef);
- (void)ImplicitDef;
- }
- }
- // Make the original def partial.
- DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
- Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
- return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Remove kill mark from uses.
- for (auto &Use : MRI->use_operands(Reg))
- Use.setIsKill(false);
- // Make all defs except the dominating one partial defs.
- SmallVector<MachineInstr *, 4> Defs;
- for (auto &Def : MRI->def_instructions(Reg))
- Defs.push_back(&Def);
- for (auto Def : Defs) {
- if (DefOpnd->getParent() == Def)
- continue;
- Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *Def);
- }
- return true;
-}
-
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 370b8cf2cfb..1196fe1512c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -457,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();
- const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+ const unsigned Reg = FirstDst->getReg();
+
+ const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : RI.getPhysRegClass(Reg);
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
@@ -1322,9 +1327,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::ENTER_WWM: {
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is entered.
+ MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64));
+ break;
+ }
case AMDGPU::EXIT_WWM: {
- // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
- // is exited.
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is exited.
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bad6c3d5656..f6978034d25 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -121,6 +121,13 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
+ let Defs = [EXEC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 29e2460ae79..7e09f41aa8d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -259,6 +260,10 @@ public:
SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
};
+ SparseBitVector<> WWMReservedRegs;
+
+ void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
private:
// SGPR->VGPR spilling support.
using SpillRegMask = std::pair<unsigned, unsigned>;
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
new file mode 100644
index 00000000000..f9bfe96f65c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -0,0 +1,221 @@
+//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to pre-allocated WWM registers
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ LiveRegMatrix *Matrix;
+ VirtRegMap *VRM;
+ RegisterClassInfo RegClassInfo;
+
+ std::vector<unsigned> RegsToRewrite;
+
+public:
+ static char ID;
+
+ SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+ initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool processDef(MachineOperand &MO);
+ void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+ return new SIPreAllocateWWMRegs();
+}
+
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVGPR(*MRI, Reg))
+ return false;
+
+ if (TRI->isPhysicalRegister(Reg))
+ return false;
+
+ if (VRM->hasPhys(Reg))
+ return false;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+
+ for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+ if (!MRI->isPhysRegUsed(PhysReg) &&
+ Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
+ Matrix->assign(LI, PhysReg);
+ assert(PhysReg != 0);
+ RegsToRewrite.push_back(Reg);
+ return true;
+ }
+ }
+
+ llvm_unreachable("physreg not found for WWM expression");
+ return false;
+}
+
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ const unsigned VirtReg = MO.getReg();
+ if (TRI->isPhysicalRegister(VirtReg))
+ continue;
+
+ if (!VRM->hasPhys(VirtReg))
+ continue;
+
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ const unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+ MO.setSubReg(0);
+ }
+
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(false);
+ }
+ }
+ }
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned Reg : RegsToRewrite) {
+ LIS->removeInterval(Reg);
+
+ const unsigned PhysReg = VRM->getPhys(Reg);
+ assert(PhysReg != 0);
+ MFI->ReserveWWMRegister(PhysReg);
+ }
+
+ RegsToRewrite.clear();
+
+ // Update the set of reserved registers to include WWM ones.
+ MRI->freezeReservedRegs(MF);
+}
+
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ LIS = &getAnalysis<LiveIntervals>();
+ Matrix = &getAnalysis<LiveRegMatrix>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ RegClassInfo.runOnMachineFunction(MF);
+
+ bool RegsAssigned = false;
+
+ // We use a reverse post-order traversal of the control-flow graph to
+ // guarantee that we visit definitions in dominance order. Since WWM
+ // expressions are guaranteed to never involve phi nodes, and we can only
+ // escape WWM through the special WWM instruction, this means that this is a
+ // perfect elimination order, so we can never do any better.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+
+ for (MachineBasicBlock *MBB : RPOT) {
+ bool InWWM = false;
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ RegsAssigned |= processDef(MI.getOperand(0));
+
+ if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
+ LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+ InWWM = true;
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+ InWWM = false;
+ }
+
+ if (!InWWM)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+ for (MachineOperand &DefOpnd : MI.defs()) {
+ RegsAssigned |= processDef(DefOpnd);
+ }
+ }
+ }
+
+ if (!RegsAssigned)
+ return false;
+
+ rewriteRegs(MF);
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8c8caaa9bc7..e3538ae6074 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -230,6 +230,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
+ for (unsigned Reg : MFI->WWMReservedRegs) {
+ reserveRegisterTuples(Reserved, Reg);
+ }
+
return Reserved;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 03c0353390f..b7d96f0842f 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -656,8 +656,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
MachineInstr *MI;
assert(SaveOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
- SaveOrig)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
.addImm(-1);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -839,7 +838,23 @@ void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
- MI->setDesc(TII->get(AMDGPU::COPY));
+
+ const unsigned Reg = MI->getOperand(0).getReg();
+
+ if (TRI->isVGPR(*MRI, Reg)) {
+ const TargetRegisterClass *regClass =
+ TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI->getRegClass(Reg)
+ : TRI->getPhysRegClass(Reg);
+
+ const unsigned MovOp = TII->getMovOpcode(regClass);
+ MI->setDesc(TII->get(MovOp));
+
+ // And make it implicitly depend on exec (like all VALU movs should do).
+ MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ } else {
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ }
}
}
OpenPOWER on IntegriCloud