author     Connor Abbott <cwabbott0@gmail.com>  2017-08-04 18:36:52 +0000
committer  Connor Abbott <cwabbott0@gmail.com>  2017-08-04 18:36:52 +0000
commit     92638ab62559ce082c02680efca7280344ca05dd (patch)
tree       972eca502c4db28254e18a1b7160eca5ae053c4b /llvm/lib/Target
parent     de068fe8b4d41dfd317f3cd54d0c439d0d4ea246 (diff)
[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the lanes are always enabled, regardless of control flow. This is required for implementing wavefront reductions in non-uniform control flow, where we need to use the inactive lanes to propagate intermediate results, so they need to be enabled. We need to propagate WWM to uses (unless they're explicitly marked as exact) so that they also propagate intermediate results correctly.

We do the analysis and exec mask munging during the WQM pass, since there are interactions with WQM for things that require both WQM and WWM. For simplicity, WWM is entirely block-local: a block is never in WWM on entry or exit, and WWM is not propagated to the block level. This means that computations involving WWM cannot involve control flow, but we only ever plan to use WWM for a few limited purposes (none of which involve control flow) anyway.

Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There isn't yet a way to turn WWM off; that will be added in a future change.

Finally, it turns out that turning on inactive lanes causes a number of problems with register allocation. While the best long-term solution seems to be teaching LLVM's register allocator about predication, for now we need to add some hacks to prevent ourselves from getting into trouble due to constraints that aren't currently expressed in LLVM. For the gory details, see the comments at the top of SIFixWWMLiveness.cpp.

Reviewers: arsenm, nhaehnle, tpr
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D35524
llvm-svn: 310087
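For context, a minimal sketch of how a frontend could request WWM for a single value through the new @llvm.amdgcn.wwm intrinsic, using LLVM's C++ IRBuilder API. The helper name and surrounding setup are hypothetical; only the intrinsic itself is introduced by this change.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Wraps a value in @llvm.amdgcn.wwm so that the computation feeding it is
// executed with all lanes enabled. Assumes Builder is positioned inside an
// AMDGPU shader function that lives in module M.
static Value *wrapInWWM(IRBuilder<> &Builder, Module &M, Value *V) {
  // The intrinsic is overloaded on its single value type.
  Function *WWM =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wwm, {V->getType()});
  return Builder.CreateCall(WWM, {V}, "wwm");
}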
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                 |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt           |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp      |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp     | 202
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp       |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td        |  18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp      | 181
9 files changed, 383 insertions, 48 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f832338fdb4..feb73f45011 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -50,6 +50,7 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
@@ -120,6 +121,9 @@ extern char &SIInsertSkipsPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3bd334c2082..7a8fd019593 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -168,6 +168,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIMemoryLegalizerPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
}
@@ -792,6 +793,10 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
@@ -808,6 +813,10 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 58b6b4857dd..5de828a7b6b 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
+ SIFixWWMLiveness.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ee2b415099b..86ad8df2a21 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -568,7 +568,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
default:
continue;
case AMDGPU::COPY:
- case AMDGPU::WQM: {
+ case AMDGPU::WQM:
+ case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really
// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
diff --git a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
new file mode 100644
index 00000000000..8b985637b08
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -0,0 +1,202 @@
+//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Computations in WWM can overwrite values in inactive channels for
+/// variables that the register allocator thinks are dead. This pass adds fake
+/// uses of those variables to WWM instructions to make sure that they aren't
+/// overwritten.
+///
+/// As an example, consider this snippet:
+/// %vgpr0 = V_MOV_B32_e32 0.0
+/// if (...) {
+/// %vgpr1 = ...
+/// %vgpr2 = WWM %vgpr1<kill>
+/// ... = %vgpr2<kill>
+/// %vgpr0 = V_MOV_B32_e32 1.0
+/// }
+/// ... = %vgpr0
+///
+/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
+/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
+/// writing %vgpr1 would only write to channels that would be clobbered by the
+/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
+/// it would clobber even the inactive channels for which the if-condition is
+/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
+/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// same register.
+///
+/// In general, we need to figure out what registers might have their inactive
+/// channels which are eventually used accidentally clobbered by a WWM
+/// instruction. We approximate this using two conditions:
+///
+/// 1. A definition of the variable reaches the WWM instruction.
+/// 2. The variable would be live at the WWM instruction if all its defs were
+/// partial defs (i.e. considered as a use), ignoring normal uses.
+///
+/// If a register matches both conditions, then we add an implicit use of it to
+/// the WWM instruction. Condition #2 is the heart of the matter: every
+/// definition is really a partial definition, since every VALU instruction is
+/// implicitly predicated. We can usually ignore this, but WWM forces us not
+/// to. Condition #1 prevents false positives if the variable is undefined at
+/// the WWM instruction anyways. This is overly conservative in certain cases,
+/// especially in uniform control flow, but this is a workaround anyways until
+/// LLVM gains the notion of predicated uses and definitions of variables.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-wwm-liveness"
+
+namespace {
+
+class SIFixWWMLiveness : public MachineFunctionPass {
+private:
+ LiveIntervals *LIS = nullptr;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+
+ SIFixWWMLiveness() : MachineFunctionPass(ID) {
+ initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool runOnWWMInstruction(MachineInstr &MI);
+
+ void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
+
+ StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+
+char SIFixWWMLiveness::ID = 0;
+
+char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
+
+FunctionPass *llvm::createSIFixWWMLivenessPass() {
+ return new SIFixWWMLiveness();
+}
+
+void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
+{
+ for (const MachineOperand &Op : MI.defs()) {
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ Regs.set(Reg);
+ }
+ }
+}
+
+bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
+ MachineBasicBlock *MBB = WWM.getParent();
+
+ // Compute the registers that are live out of MI by figuring out which defs
+ // are reachable from MI.
+ SparseBitVector<> LiveOut;
+
+ for (auto II = MachineBasicBlock::iterator(WWM), IE =
+ MBB->end(); II != IE; ++II) {
+ addDefs(*II, LiveOut);
+ }
+
+ for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
+ E = df_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, LiveOut);
+ }
+ }
+
+ // Compute the registers that reach MI.
+ SparseBitVector<> Reachable;
+
+ for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
+ MBB->rend(); II != IE; ++II) {
+ addDefs(*II, Reachable);
+ }
+
+ for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
+ E = idf_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, Reachable);
+ }
+ }
+
+ // find the intersection, and add implicit uses.
+ LiveOut &= Reachable;
+
+ bool Modified = false;
+ for (unsigned Reg : LiveOut) {
+ WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ if (LIS) {
+ // FIXME: is there a better way to update the live interval?
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ Modified = true;
+ }
+
+ return Modified;
+}
+
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ Modified |= runOnWWMInstruction(MI);
+ }
+ }
+ }
+
+ return Modified;
+}
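The pass above computes, for each WWM instruction, the intersection of two def sets: the defs that reach the instruction (a backward walk over the CFG) and the defs reachable from it (a forward walk). A small self-contained C++ sketch of that intersection over a toy CFG follows; all types and names are illustrative rather than taken from the patch, and it omits the detail that the real pass splits the WWM instruction's own block at the instruction itself.

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  std::vector<int> Defs;              // registers defined in this block
  std::vector<Block *> Succs, Preds;  // CFG edges
};

// Collect every register defined in blocks reachable from Start, walking
// either forward (successors) or backward (predecessors).
static void collect(Block *Start, std::set<int> &Out, bool Forward) {
  std::set<Block *> Seen;
  std::vector<Block *> Work{Start};
  while (!Work.empty()) {
    Block *B = Work.back();
    Work.pop_back();
    if (!Seen.insert(B).second)
      continue;
    Out.insert(B->Defs.begin(), B->Defs.end());
    for (Block *N : Forward ? B->Succs : B->Preds)
      Work.push_back(N);
  }
}

int main() {
  // CFG: A -> B -> C, with the WWM instruction sitting in B.
  Block A, B, C;
  A.Succs = {&B}; B.Preds = {&A};
  B.Succs = {&C}; C.Preds = {&B};
  A.Defs = {0};     // reg 0 defined before the WWM instruction
  C.Defs = {0, 2};  // reg 0 redefined after it; reg 2 defined only after it

  std::set<int> Reaching, LiveOut;
  collect(&B, Reaching, /*Forward=*/false); // defs that reach the WWM op
  collect(&B, LiveOut, /*Forward=*/true);   // defs reachable from the WWM op

  // Registers in both sets would get an implicit use on the WWM instruction
  // (here: reg 0, mirroring %vgpr0 in the example at the top of the file).
  for (int Reg : Reaching)
    if (LiveOut.count(Reg))
      std::printf("add implicit use of reg %d\n", Reg);
  return 0;
}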
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fecad1e1646..f3036ef85ee 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3947,6 +3947,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
0);
}
+ case Intrinsic::amdgcn_wwm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+ 0);
+ }
default:
return Op;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index dc4b9998786..0b0d0388031 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1156,6 +1156,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::EXIT_WWM: {
+ // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+ // is exited.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
}
return true;
}
@@ -2667,6 +2673,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3972,6 +3979,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
+ case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))
return nullptr;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f20ce4d4b28..a13c8f32fe6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -117,12 +117,26 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
-// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
-// after the WQM pass processes them.
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 6d91a7b0cf1..1a0f0f9aca9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -9,7 +9,7 @@
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
@@ -29,6 +29,13 @@
/// ...
/// S_MOV_B64 EXEC, Tmp
///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
@@ -85,7 +92,8 @@ namespace {
enum {
StateWQM = 0x1,
- StateExact = 0x2,
+ StateWWM = 0x2,
+ StateExact = 0x4,
};
struct PrintState {
@@ -98,9 +106,14 @@ public:
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)
OS << "WQM";
- if (PS.State & StateExact) {
+ if (PS.State & StateWWM) {
if (PS.State & StateWQM)
OS << '|';
+ OS << "WWM";
+ }
+ if (PS.State & StateExact) {
+ if (PS.State & (StateWQM | StateWWM))
+ OS << '|';
OS << "Exact";
}
@@ -130,6 +143,7 @@ struct WorkItem {
class SIWholeQuadMode : public MachineFunctionPass {
private:
+ CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
@@ -163,6 +177,10 @@ private:
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
+ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveOrig);
+ void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
void lowerLiveMaskQueries(unsigned LiveMaskReg);
@@ -223,7 +241,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
- assert(Flag == StateWQM);
+ assert(!(Flag & StateExact) && Flag != 0);
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
@@ -243,7 +261,6 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(Flag == StateWQM);
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
@@ -302,7 +319,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
@@ -316,6 +333,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::WWM) {
+ // The WWM intrinsic doesn't make the same guarantee, and plus it needs
+ // to be executed in WQM or Exact so that its copy doesn't clobber
+ // inactive lanes.
+ markInstructionUses(MI, StateWWM, Worklist);
+ GlobalFlags |= StateWWM;
+ LowerToCopyInstrs.push_back(&MI);
+ continue;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -323,7 +348,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
- III.Disabled = StateWQM;
+ III.Disabled = StateWQM | StateWWM;
continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
@@ -383,7 +408,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = II.Needs | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -589,6 +614,29 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
LIS->InsertMachineInstrInMaps(*MI);
}
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveOrig) {
+ MachineInstr *MI;
+
+ assert(SaveOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+ SaveOrig)
+ .addImm(-1);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedOrig) {
+ MachineInstr *MI;
+
+ assert(SavedOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ .addReg(SavedOrig);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
@@ -597,45 +645,63 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
const BlockInfo &BI = BII->second;
- if (!(BI.InNeeds & StateWQM))
- return;
-
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
unsigned SavedWQMReg = 0;
+ unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;
- char State = isEntry ? StateExact : StateWQM;
+ char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonWWMState = 0;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
++II; // Skip the instruction that saves LiveMask
- MachineBasicBlock::iterator First = IE;
+ // This stores the first instruction where it's safe to switch from WQM to
+ // Exact or vice versa.
+ MachineBasicBlock::iterator FirstWQM = IE;
+
+ // This stores the first instruction where it's safe to switch from WWM to
+ // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+ // switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = StateExact | StateWQM;
+ char Needs = StateExact | StateWQM; // WWM is disabled by default
char OutNeeds = 0;
- if (First == IE)
- First = II;
+ if (FirstWQM == IE)
+ FirstWQM = II;
+
+ if (FirstWWM == IE)
+ FirstWWM = II;
+ // First, figure out the allowed states (Needs) based on the propagated
+ // flags.
if (II != IE) {
MachineInstr &MI = *II;
if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- if (III->second.Needs & StateWQM)
+ if (III->second.Needs & StateWWM)
+ Needs = StateWWM;
+ else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
+ } else {
+ // If the instruction doesn't actually need a correct EXEC, then we can
+ // safely leave WWM enabled.
+ Needs = StateExact | StateWQM | StateWWM;
}
if (MI.isTerminator() && OutNeeds == StateExact)
@@ -655,35 +721,63 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
Needs = StateWQM | StateExact;
}
+ // Now, transition if necessary.
if (!(Needs & State)) {
+ MachineBasicBlock::iterator First;
+ if (State == StateWWM || Needs == StateWWM) {
+ // We must switch to or from WWM
+ First = FirstWWM;
+ } else {
+ // We only need to switch to/from WQM, so we can use FirstWQM
+ First = FirstWQM;
+ }
+
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact || WQMFromExec);
- if (Needs == StateExact) {
- if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ if (State == StateWWM) {
+ assert(SavedNonWWMReg);
+ fromWWM(MBB, Before, SavedNonWWMReg);
+ State = NonWWMState;
+ }
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
- State = StateExact;
+ if (Needs == StateWWM) {
+ NonWWMState = State;
+ SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ toWWM(MBB, Before, SavedNonWWMReg);
+ State = StateWWM;
} else {
- assert(Needs == StateWQM);
- assert(WQMFromExec == (SavedWQMReg == 0));
+ if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toWQM(MBB, Before, SavedWQMReg);
+ toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ State = StateExact;
+ } else if (State == StateExact && (Needs & StateWQM) &&
+ !(Needs & StateExact)) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
- if (SavedWQMReg) {
- LIS->createAndComputeVirtRegInterval(SavedWQMReg);
- SavedWQMReg = 0;
+ toWQM(MBB, Before, SavedWQMReg);
+
+ if (SavedWQMReg) {
+ LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+ State = StateWQM;
+ } else {
+ // We can get here if we transitioned from WWM to a non-WWM state that
+ // already matches our needs, but we shouldn't need to do anything.
+ assert(Needs & State);
}
- State = StateWQM;
}
-
- First = IE;
}
- if (Needs != (StateExact | StateWQM))
- First = IE;
+ if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM))
+ FirstWQM = IE;
+ FirstWWM = IE;
+ }
if (II == IE)
break;
@@ -710,13 +804,11 @@ void SIWholeQuadMode::lowerCopyInstrs() {
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
- return false;
-
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
+ CallingConv = MF.getFunction()->getCallingConv();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -726,14 +818,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
+ unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);
- return !LiveMaskQueries.empty();
- }
-
- // Store a copy of the original live mask when required
- unsigned LiveMaskReg = 0;
- {
+ if (!(GlobalFlags & StateWWM))
+ return !LiveMaskQueries.empty();
+ } else {
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
@@ -745,13 +836,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS->InsertMachineInstrInMaps(*MI);
}
+ lowerLiveMaskQueries(LiveMaskReg);
+
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// EntryMI may become invalid here
return true;
@@ -760,7 +852,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
DEBUG(printInfo());
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// Handle the general case