summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp 385
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir 116
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.ll 39
3 files changed, 451 insertions, 89 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 5d613d8874f..7761418c533 100644
--- a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -10,7 +10,7 @@
/// \file
/// Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to WWM instructions to make sure that they aren't
+/// uses of those variables to their def(s) to make sure that they aren't
/// overwritten.
///
/// As an example, consider this snippet:
@@ -29,25 +29,44 @@
/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
/// it would clobber even the inactive channels for which the if-condition is
/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// of %vgpr0 to its def to make sure %vgpr0 and %vgpr1 aren't allocated to the
/// same register.
///
/// In general, we need to figure out what registers might have their inactive
/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We approximate this using two conditions:
+/// instruction. We do that by spotting three separate cases of registers:
///
-/// 1. A definition of the variable reaches the WWM instruction.
-/// 2. The variable would be live at the WWM instruction if all its defs were
-/// partial defs (i.e. considered as a use), ignoring normal uses.
+/// 1. A "then phi": the value resulting from phi elimination of a phi node at
+/// the end of an if..endif. If there is WWM code in the "then", then we
+/// make the def at the end of the "then" branch a partial def by adding an
+/// implicit use of the register.
///
-/// If a register matches both conditions, then we add an implicit use of it to
-/// the WWM instruction. Condition #2 is the heart of the matter: every
-/// definition is really a partial definition, since every VALU instruction is
-/// implicitly predicated. We can usually ignore this, but WWM forces us not
-/// to. Condition #1 prevents false positives if the variable is undefined at
-/// the WWM instruction anyways. This is overly conservative in certain cases,
-/// especially in uniform control flow, but this is a workaround anyways until
-/// LLVM gains the notion of predicated uses and definitions of variables.
+/// 2. A "loop exit register": a value written inside a loop but used outside the
+/// loop, where there is WWM code inside the loop (the case in the example
+/// above). We add an implicit_def of the register in the loop pre-header,
+/// and make the original def a partial def by adding an implicit use of the
+/// register.
+///
+/// 3. A "loop phi": the value resulting from phi elimination of a phi node
+/// in a loop header. If there is WWM code inside the loop, then we make all
+/// defs inside the loop partial defs by adding an implicit use of the
+/// register on each one.
+///
+/// Note that we do not need to consider an if..else..endif phi. We only need to
+/// consider non-uniform control flow, and control flow structurization would
+/// have transformed a non-uniform if..else..endif into two if..endifs.
+///
+/// The analysis to detect these cases relies on a property of the MIR
+/// arising from this pass running straight after PHIElimination and before any
+/// coalescing: that any virtual register with more than one definition must be
+/// the new register added to lower a phi node by PHIElimination.
+///
+/// FIXME: We should detect whether a register in one of the above categories is
+/// already live at the WWM code before deciding to add the implicit uses to
+/// synthesize its liveness.
+///
+/// FIXME: I believe this whole scheme may be flawed due to the possibility of
+/// the register allocator doing live interval splitting.
///
//===----------------------------------------------------------------------===//
@@ -59,7 +78,9 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -71,10 +92,18 @@ namespace {
class SIFixWWMLiveness : public MachineFunctionPass {
private:
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *LoopInfo;
LiveIntervals *LIS = nullptr;
+ const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ std::vector<MachineInstr *> WWMs;
+ std::vector<MachineOperand *> ThenDefs;
+ std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
+ std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
+
public:
static char ID;
@@ -84,13 +113,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- bool runOnWWMInstruction(MachineInstr &MI);
-
- void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
-
StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addRequiredID(MachineLoopInfoID);
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
@@ -100,11 +127,21 @@ public:
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ void processDef(MachineOperand &DefOpnd);
+ bool processThenDef(MachineOperand *DefOpnd);
+ bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
+ bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
};
} // End anonymous namespace.
-INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
"SI fix WWM liveness", false, false)
char SIFixWWMLiveness::ID = 0;
@@ -115,89 +152,267 @@ FunctionPass *llvm::createSIFixWWMLivenessPass() {
return new SIFixWWMLiveness();
}
-void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
-{
- for (const MachineOperand &Op : MI.defs()) {
- if (Op.isReg()) {
- unsigned Reg = Op.getReg();
- if (TRI->isVGPR(*MRI, Reg))
- Regs.set(Reg);
- }
- }
-}
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
-bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
- MachineBasicBlock *MBB = WWM.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- // Compute the registers that are live out of MI by figuring out which defs
- // are reachable from MI.
- SparseBitVector<> LiveOut;
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
- for (auto II = MachineBasicBlock::iterator(WWM), IE =
- MBB->end(); II != IE; ++II) {
- addDefs(*II, LiveOut);
- }
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ LoopInfo = &getAnalysis<MachineLoopInfo>();
- for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
- E = df_end(MBB);
- I != E; ++I) {
- for (const MachineInstr &MI : **I) {
- addDefs(MI, LiveOut);
+ // Scan the function to find the WWM sections and the candidate registers for
+ // having liveness modified.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM)
+ WWMs.push_back(&MI);
+ else {
+ for (MachineOperand &DefOpnd : MI.defs()) {
+ if (DefOpnd.isReg()) {
+ unsigned Reg = DefOpnd.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ processDef(DefOpnd);
+ }
+ }
+ }
}
}
+ if (!WWMs.empty()) {
+ // Synthesize liveness over WWM sections as required.
+ for (auto ThenDef : ThenDefs)
+ Modified |= processThenDef(ThenDef);
+ for (auto LoopExitDef : LoopExitDefs)
+ Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
+ for (auto LoopPhiDef : LoopPhiDefs)
+ Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
+ }
- // Compute the registers that reach MI.
- SparseBitVector<> Reachable;
+ WWMs.clear();
+ ThenDefs.clear();
+ LoopExitDefs.clear();
+ LoopPhiDefs.clear();
- for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
- MBB->rend(); II != IE; ++II) {
- addDefs(*II, Reachable);
- }
+ return Modified;
+}
- for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
- E = idf_end(MBB);
- I != E; ++I) {
- for (const MachineInstr &MI : **I) {
- addDefs(MI, Reachable);
+// During the function scan, process an operand that defines a VGPR.
+// This categorizes the register and puts it in the appropriate list for later
+// use when processing a WWM section.
+void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
+ unsigned Reg = DefOpnd.getReg();
+ // Get all the defining instructions. For convenience, make Defs[0] the def
+ // we are on now.
+ SmallVector<const MachineInstr *, 4> Defs;
+ Defs.push_back(DefOpnd.getParent());
+ for (auto &MI : MRI->def_instructions(Reg)) {
+ if (&MI != DefOpnd.getParent())
+ Defs.push_back(&MI);
+ }
+ // Check whether this def dominates all the others. If not, ignore this def.
+ // Either it is going to be processed when the scan encounters its other def
+ // that dominates all defs, or there is no def that dominates all others.
+ // The latter case is an eliminated phi from an if..else..endif or similar,
+ // which must be for uniform control flow so can be ignored.
+ // Because this pass runs shortly after PHIElimination, we assume that any
+ // multi-def register is a lowered phi, and thus has each def in a separate
+ // basic block.
+ for (unsigned I = 1; I != Defs.size(); ++I) {
+ if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
+ return;
+ }
+ // Check for the case of an if..endif lowered phi: It has two defs, one
+ // dominates the other, and there is a single use, located in a successor
+ // block of the dominant def's block.
+ // Later we will spot any WWM code inside the "then" clause and turn the
+ // second (non-dominant) def into a partial def so its liveness goes through
+ // that WWM code.
+ if (Defs.size() == 2) {
+ auto DomDefBlock = Defs[0]->getParent();
+ if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
+ auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+ for (auto Succ : DomDefBlock->successors()) {
+ if (Succ == UseBlock) {
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
+ ThenDefs.push_back(&DefOpnd);
+ return;
+ }
+ }
}
}
-
- // find the intersection, and add implicit uses.
- LiveOut &= Reachable;
-
- bool Modified = false;
- for (unsigned Reg : LiveOut) {
- WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- if (LIS) {
- // FIXME: is there a better way to update the live interval?
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ // Check for the case of a non-lowered-phi register (single def) that exits
+ // a loop, that is, it has a use that is outside a loop that the def is
+ // inside. We find the outermost loop that the def is inside but a use is
+ // outside. Later we will spot any WWM code inside that loop and then make
+ // the def a partial def so its liveness goes round the loop and through the
+ // WWM code.
+ if (Defs.size() == 1) {
+ auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
+ if (!Loop)
+ return;
+ bool IsLoopExit = false;
+ for (auto &Use : MRI->use_instructions(Reg)) {
+ auto UseBlock = Use.getParent();
+ if (Loop->contains(UseBlock))
+ continue;
+ IsLoopExit = true;
+ while (auto Parent = Loop->getParentLoop()) {
+ if (Parent->contains(UseBlock))
+ break;
+ Loop = Parent;
+ }
}
- Modified = true;
+ if (!IsLoopExit)
+ return;
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is a loop exit reg with loop header at "
+ << "bb." << Loop->getHeader()->getNumber() << "\n");
+ LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
+ &DefOpnd, Loop));
+ return;
}
-
- return Modified;
+ // Check for the case of a lowered single-preheader-loop phi, that is, a
+ // multi-def register where the dominating def is in the loop pre-header and
+ // all other defs are in backedges. Later we will spot any WWM code inside
+ // that loop and then make the backedge defs partial defs so the liveness
+ // goes through the WWM code.
+ // Note that we are ignoring multi-preheader loops on the basis that the
+ // structurizer does not allow that for non-uniform loops.
+ // There must be a single use in the loop header.
+ if (!MRI->hasOneUse(Reg))
+ return;
+ auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+ auto Loop = LoopInfo->getLoopFor(UseBlock);
+ if (!Loop || Loop->getHeader() != UseBlock
+ || Loop->contains(Defs[0]->getParent())) {
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is multi-def but single use not in loop header\n");
+ return;
+ }
+ for (unsigned I = 1; I != Defs.size(); ++I) {
+ if (!Loop->contains(Defs[I]->getParent()))
+ return;
+ }
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is a loop phi reg with loop header at "
+ << "bb." << Loop->getHeader()->getNumber() << "\n");
+ LoopPhiDefs.push_back(
+ std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
}
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
- bool Modified = false;
-
- // This doesn't actually need LiveIntervals, but we can preserve them.
- LIS = getAnalysisIfAvailable<LiveIntervals>();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
+// Process a then phi def: It has two defs, one dominates the other, and there
+// is a single use in a successor of the dominant def. Here we spot any WWM
+// code inside the "then" clause and turn the second def into a partial def so
+// its liveness goes through the WWM code in the "then" clause.
+bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
+ LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
+ if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ // Ignore if dominating def is undef.
+ LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n");
+ return false;
+ }
+ unsigned Reg = DefOpnd->getReg();
+ // Get the use block, which is the endif block.
+ auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
+ // Check whether there is WWM code inside the then branch. The WWM code must
+ // be dominated by the if but not dominated by the endif.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
+ && !DomTree->dominates(UseBlock, WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
+ }
+ }
+ if (!ContainsWWM)
+ return false;
+ // Get the other def.
+ MachineInstr *OtherDef = nullptr;
+ for (auto &MI : MRI->def_instructions(Reg)) {
+ if (&MI != DefOpnd->getParent())
+ OtherDef = &MI;
+ }
+ // Make it a partial def.
+ OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *OtherDef);
+ return true;
+}
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
- Modified |= runOnWWMInstruction(MI);
- }
+// Process a loop exit def, that is, a register with a single def in a loop
+// that has a use outside the loop. Here we spot any WWM code inside that loop
+// and then make the def a partial def so its liveness goes round the loop and
+// through the WWM code.
+bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
+ MachineLoop *Loop) {
+ LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
+ // Check whether there is WWM code inside the loop.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (Loop->contains(WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
}
}
+ if (!ContainsWWM)
+ return false;
+ unsigned Reg = DefOpnd->getReg();
+ // Add a new implicit_def in loop preheader(s).
+ for (auto Pred : Loop->getHeader()->predecessors()) {
+ if (!Loop->contains(Pred)) {
+ auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
+ LLVM_DEBUG(dbgs() << *ImplicitDef);
+ (void)ImplicitDef;
+ }
+ }
+ // Make the original def partial.
+ DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
+ Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
+ return true;
+}
- return Modified;
+// Process a loop phi def, that is, a multi-def register where the dominating
+// def is in the loop pre-header and all other defs are in backedges. Here we
+// spot any WWM code inside that loop and then make the backedge defs partial
+// defs so the liveness goes through the WWM code.
+bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
+ MachineLoop *Loop) {
+ LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
+ // Check whether there is WWM code inside the loop.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (Loop->contains(WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
+ }
+ }
+ if (!ContainsWWM)
+ return false;
+ unsigned Reg = DefOpnd->getReg();
+ // Remove kill mark from uses.
+ for (auto &Use : MRI->use_operands(Reg))
+ Use.setIsKill(false);
+ // Make all defs except the dominating one partial defs.
+ SmallVector<MachineInstr *, 4> Defs;
+ for (auto &Def : MRI->def_instructions(Reg))
+ Defs.push_back(&Def);
+ for (auto Def : Defs) {
+ if (DefOpnd->getParent() == Def)
+ continue;
+ Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *Def);
+ }
+ return true;
}
+
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir b/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir
index 0acf154ce12..8049d780646 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir
@@ -1,8 +1,11 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
-#CHECK: $exec = EXIT_WWM killed %19, implicit %21
+
+# Test a then phi value.
+#CHECK: test_wwm_liveness_then_phi
+#CHECK: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit %21
---
-name: test_wwm_liveness
+name: test_wwm_liveness_then_phi
alignment: 0
exposesReturnsTwice: false
legalized: false
@@ -71,3 +74,112 @@ body: |
SI_RETURN_TO_EPILOG killed $vgpr0
...
+
+# Test a loop with a loop exit value and a loop phi.
+#CHECK: test_wwm_liveness_loop
+#CHECK: %4:vgpr_32 = IMPLICIT_DEF
+#CHECK: bb.1:
+#CHECK: %4:vgpr_32 = FLAT_LOAD_DWORD{{.*}}, implicit %4
+#CHECK: %27:vgpr_32 = COPY killed %21, implicit %27
+
+---
+name: test_wwm_liveness_loop
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '' }
+ - { id: 1, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 2, class: sreg_64, preferred-register: '' }
+ - { id: 3, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 4, class: vgpr_32, preferred-register: '' }
+ - { id: 5, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 6, class: sreg_64, preferred-register: '' }
+ - { id: 7, class: sreg_64, preferred-register: '' }
+ - { id: 8, class: sreg_64, preferred-register: '' }
+ - { id: 9, class: vreg_64, preferred-register: '' }
+ - { id: 10, class: vgpr_32, preferred-register: '' }
+ - { id: 11, class: vgpr_32, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
+ - { id: 13, class: sreg_64, preferred-register: '' }
+ - { id: 14, class: vreg_64, preferred-register: '' }
+ - { id: 15, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 16, class: vgpr_32, preferred-register: '' }
+ - { id: 17, class: sreg_64, preferred-register: '$vcc' }
+ - { id: 18, class: vgpr_32, preferred-register: '' }
+ - { id: 19, class: vgpr_32, preferred-register: '' }
+ - { id: 20, class: vgpr_32, preferred-register: '' }
+ - { id: 21, class: vgpr_32, preferred-register: '' }
+ - { id: 22, class: vgpr_32, preferred-register: '' }
+ - { id: 23, class: sreg_64, preferred-register: '' }
+ - { id: 24, class: sreg_64, preferred-register: '' }
+ - { id: 25, class: sreg_64, preferred-register: '' }
+ - { id: 26, class: sreg_64, preferred-register: '' }
+ - { id: 27, class: vgpr_32, preferred-register: '' }
+liveins:
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ %25:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ %0:vgpr_32 = FLAT_LOAD_DWORD undef %9:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
+ $exec = EXIT_WWM killed %25
+ %12:vgpr_32 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
+ %7:sreg_64 = S_MOV_B64 0
+ %26:sreg_64 = COPY killed %7
+ %27:vgpr_32 = COPY killed %12
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %24:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ %20:vgpr_32 = COPY killed %27
+ %2:sreg_64 = COPY killed %26
+ %4:vgpr_32 = FLAT_LOAD_DWORD undef %14:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
+ $exec = EXIT_WWM killed %24
+ %22:vgpr_32 = V_ADD_I32_e32 -1, killed %20, implicit-def dead $vcc, implicit $exec
+ %17:sreg_64 = V_CMP_EQ_U32_e64 0, %22, implicit $exec
+ %6:sreg_64 = S_OR_B64 killed %17, killed %2, implicit-def $scc
+ %21:vgpr_32 = COPY killed %22
+ %26:sreg_64 = COPY %6
+ %27:vgpr_32 = COPY killed %21
+ $exec = S_ANDN2_B64_term $exec, %6
+ S_CBRANCH_EXECNZ %bb.1, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ $exec = S_OR_B64 $exec, killed %6, implicit-def $scc
+ %23:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ %18:vgpr_32 = V_ADD_F32_e32 killed %0, killed %4, implicit $exec
+ $exec = EXIT_WWM killed %23
+ early-clobber %19:vgpr_32 = COPY killed %18, implicit $exec
+ $vgpr0 = COPY killed %19
+ SI_RETURN_TO_EPILOG killed $vgpr0
+
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 4c9a8d5a938..e94a4888521 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -260,8 +260,9 @@ main_body:
}
; Check that WWM is turned on correctly across basic block boundaries.
+; if..then..endif version
;
-;CHECK-LABEL: {{^}}test_wwm6:
+;CHECK-LABEL: {{^}}test_wwm6_then:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
@@ -272,7 +273,7 @@ main_body:
;VI-CHECK: flat_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG2]]
-define amdgpu_ps float @test_wwm6() {
+define amdgpu_ps float @test_wwm6_then() {
main_body:
%src0 = load volatile float, float addrspace(1)* undef
; use mbcnt to make sure the branch is divergent
@@ -292,6 +293,40 @@ endif:
ret float %out.1
}
+; Check that WWM is turned on correctly across basic block boundaries.
+; loop version
+;
+;CHECK-LABEL: {{^}}test_wwm6_loop:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %loop
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+define amdgpu_ps float @test_wwm6_loop() {
+main_body:
+ %src0 = load volatile float, float addrspace(1)* undef
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ br label %loop
+
+loop:
+ %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+ %src1 = load volatile float, float addrspace(1)* undef
+ %out = fadd float %src0, %src1
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ %counter.1 = sub i32 %counter, 1
+ %cc = icmp ne i32 %counter.1, 0
+ br i1 %cc, label %loop, label %endloop
+
+endloop:
+ ret float %out.0
+}
+
; Check that @llvm.amdgcn.set.inactive disables WWM.
;
;CHECK-LABEL: {{^}}test_set_inactive1:
OpenPOWER on IntegriCloud