AMDGPU: Reduce the duration of whole-quad-mode

Summary: This contains two changes that reduce the time spent in WQM, with the intention of reducing bandwidth required by VMEM loads: 1. Sampling instructions by themselves don't need to run in WQM, only their coordinate inputs need it (unless of course there is a dependent sampling instruction). The initial scanInstructions step is modified accordingly. 2. When switching back from WQM to Exact, switch back as soon as possible. This affects the logic in processBlock. This should always be a win or at worst neutral. There are also some cleanups (e.g. remove unused ExecExports) and some new debugging output. Reviewers: arsenm, tstellarAMD, mareko Subscribers: arsenm, llvm-commits, kzhuravl Differential Revision: http://reviews.llvm.org/D22092 llvm-svn: 280590
author: Nicolai Haehnle <nhaehnle@gmail.com> 2016-09-03 12:26:38 +0000
committer: Nicolai Haehnle <nhaehnle@gmail.com> 2016-09-03 12:26:38 +0000
commit: 3bba6a843809d4615d694703ab569f6c1be3c936 (patch)
tree: 1cf546a3cdbe31eb2b1a117a71376ff9ea322b81 /llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
parent: a246dccc26667ab8bf6b6b427d1384bb65828509 (diff)
download: bcm5719-llvm-3bba6a843809d4615d694703ab569f6c1be3c936.tar.gz
bcm5719-llvm-3bba6a843809d4615d694703ab569f6c1be3c936.zip
1 files changed, 108 insertions, 54 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 03efd1f3326..ecf1d5abb9f 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -69,6 +69,25 @@ enum {
   StateExact = 0x2,
 };
 
+struct PrintState {
+public:
+  explicit PrintState(int State) : State(State) {}
+
+  int State;
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+  if (PS.State & StateWQM)
+    OS << "WQM";
+  if (PS.State & StateExact) {
+    if (PS.State & StateWQM)
+      OS << '|';
+    OS << "Exact";
+  }
+
+  return OS;
+}
+
 struct InstrInfo {
   char Needs = 0;
   char OutNeeds = 0;
@@ -98,11 +117,13 @@ private:
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<const MachineInstr *, 2> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void printInfo();
+
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -151,6 +172,24 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
   return new SIWholeQuadMode;
 }
 
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+           << "  InNeeds = " << PrintState(BII.second.InNeeds)
+           << ", Needs = " << PrintState(BII.second.Needs)
+           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+    }
+  }
+}
+
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
@@ -168,6 +207,45 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
   Worklist.push_back(&MI);
 }
 
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    unsigned Reg = Use.getReg();
+
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
+        continue;
+
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
+    }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
+  }
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +261,19 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode)) {
         Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad, they just require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
+        continue;
       } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
-        // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP) {
-          if (MI.getOperand(4).getImm() != 0)
-            ExecExports.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
@@ -259,43 +340,9 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
 
   // Propagate WQM flag to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
-  if (II.Needs != StateWQM)
-    return;
-
-  for (const MachineOperand &Use : MI.uses()) {
-    if (!Use.isReg() || !Use.isUse())
-      continue;
-
-    unsigned Reg = Use.getReg();
-
-    // Handle physical registers that we need to track; this is mostly relevant
-    // for VCC, which can appear as the (implicit) input of a uniform branch,
-    // e.g. when a loop counter is stored in a VGPR.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-      if (Reg == AMDGPU::EXEC)
-        continue;
-
-      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
-        LiveRange &LR = LIS->getRegUnit(*RegUnit);
-        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-        if (!Value)
-          continue;
-
-        // Since we're in machine SSA, we do not need to track physical
-        // registers across basic blocks.
-        if (Value->isPHIDef())
-          continue;
-
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
-                        Worklist);
-      }
 
-      continue;
-    }
-
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
-  }
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -395,9 +442,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
     return;
 
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  MachineInstr *FirstNonWQM = nullptr;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   while (II != IE) {
@@ -428,15 +478,16 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
     if (InstrInfoIt != Instructions.end()) {
       Needs = InstrInfoIt->second.Needs;
       OutNeeds = InstrInfoIt->second.OutNeeds;
-
-      // Make sure to switch to Exact mode before the end of the block when
-      // Exact and only Exact is needed further downstream.
-      if (OutNeeds == StateExact && MI.isTerminator()) {
-        assert(Needs == 0);
-        Needs = StateExact;
-      }
     }
 
+    // Keep track of the first consecutive non-WQM instruction, so that we
+    // switch away from WQM as soon as possible, potentially saving a small
+    // bit of bandwidth on loads.
+    if (Needs == StateWQM)
+      FirstNonWQM = nullptr;
+    else if (!FirstNonWQM)
+      FirstNonWQM = &MI;
+
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
@@ -445,7 +496,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
         if (!WQMFromExec && (OutNeeds & StateWQM))
           SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+        toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
@@ -455,7 +506,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
       State = Needs;
     }
 
-    if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)
+    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
       MI.getOperand(3).setImm(1);
   }
 
@@ -463,7 +514,9 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
   } else if (BI.OutNeeds == StateExact && State != StateExact) {
-    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+                             : MBB.getFirstTerminator(),
+            0, LiveMaskReg);
   }
 }
 
@@ -483,7 +536,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   Instructions.clear();
   Blocks.clear();
-  ExecExports.clear();
   LiveMaskQueries.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -523,6 +575,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  DEBUG(printInfo());
+
   lowerLiveMaskQueries(LiveMaskReg);
 
   // Handle the general case
author	Nicolai Haehnle <nhaehnle@gmail.com>	2016-09-03 12:26:38 +0000
committer	Nicolai Haehnle <nhaehnle@gmail.com>	2016-09-03 12:26:38 +0000
commit	3bba6a843809d4615d694703ab569f6c1be3c936 (patch)
tree	1cf546a3cdbe31eb2b1a117a71376ff9ea322b81 /llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
parent	a246dccc26667ab8bf6b6b427d1384bb65828509 (diff)
download	bcm5719-llvm-3bba6a843809d4615d694703ab569f6c1be3c936.tar.gz bcm5719-llvm-3bba6a843809d4615d694703ab569f6c1be3c936.zip