author     Nicolai Haehnle <nhaehnle@gmail.com>   2016-03-21 20:28:33 +0000
committer  Nicolai Haehnle <nhaehnle@gmail.com>   2016-03-21 20:28:33 +0000
commit     213e87f2ee2f5f5fa5704346d2f73df1d61a2f02 (patch)
tree       8a56086de35da4d164e198a0b26e93122b564380 /llvm/lib
parent     b14f4fd0defd496c0c5ac6fc36aa15e9c7f450ce (diff)
AMDGPU: Add SIWholeQuadMode pass
Summary:
Whole quad mode is already enabled for pixel shaders that compute
derivatives, but it must be suspended for instructions that cause a
shader to have side effects (i.e. stores and atomics). This pass
addresses the issue by storing the real (initial) live mask in a
register, masking EXEC before instructions that require exact
execution and (re-)enabling WQM where required.

This pass is run before register coalescing so that we can use
machine SSA for analysis.

The changes in this patch expose a problem with the second machine
scheduling pass: target independent instructions like COPY implicitly
use EXEC when they operate on VGPRs, but this fact is not encoded in
the MIR. This can lead to miscompilation because instructions are
moved past changes to EXEC. This patch fixes the problem by adding
use-implicit operands to target independent instructions. Some
general codegen passes are relaxed to work with such implicit use
operands.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: MatzeB, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18162

llvm-svn: 263982
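For orientation, the shape of the transformation is sketched below. This is an
illustrative sequence only, mirroring the description in the header comment of
the new SIWholeQuadMode.cpp further down in this diff (register names are
placeholders, and the buffer store is just one example of an instruction that
needs exact execution):

    ; entry prolog: remember the real live mask, then enter whole quad mode
    S_MOV_B64           LiveMask, EXEC
    S_WQM_B64           EXEC, EXEC
    ...                                   ; derivative-producing code runs in WQM
    ; before a store/atomic: restrict EXEC to the real live mask
    S_AND_SAVEEXEC_B64  Tmp, LiveMask
    BUFFER_STORE_DWORD  ...               ; executes exactly, without helper lanes
    ; afterwards: restore the whole-quad exec mask
    S_MOV_B64           EXEC, Tmp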
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h          1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp    2
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt             1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h              4
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp    33
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h           7
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp      465
9 files changed, 515 insertions, 15 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5a8fc4675bf..c1b66619d93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
@@ -70,6 +71,9 @@ extern char &SILowerI1CopiesID;
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 79554a5f656..461dd99a4f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -62,7 +62,6 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
-
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dcd165cdd95..dce2a92de98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -57,6 +57,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertNopsPass(*PR);
initializeSIInsertWaitsPass(*PR);
+ initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
}
@@ -346,6 +347,7 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
+ addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 5bc97350f48..12c5633a372 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
+ SIWholeQuadMode.cpp
)
add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 577a234db35..f0b420d10a8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1248,6 +1248,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
.addImm(0); // omod
}
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Target-independent instructions do not have an implicit-use of EXEC, even
+ // when they operate on VGPRs. Treating EXEC modifications as scheduling
+ // boundaries prevents incorrect movements of such instructions.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+
+ return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
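The isSchedulingBoundary() override added above addresses the EXEC problem
described in the commit message: generic opcodes such as COPY carry no
implicit EXEC use in the MIR, so nothing would otherwise stop the second
machine scheduler from moving them across an EXEC update. A minimal
illustration of the hazard (pseudo-MIR, register names are hypothetical):

    %vgpr1 = COPY %vgpr0                  ; implicitly depends on EXEC, not encoded in MIR
    EXEC = S_AND_B64 EXEC, %livemask      ; switch from WQM to exact execution
    ; if the scheduler were to sink the COPY below the EXEC write, the copy would
    ; no longer execute for helper lanes, breaking later derivative computations

Treating every instruction that modifies EXEC as a scheduling boundary rules
such reorderings out.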
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4b6ee47a5c1..2ed769de390 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -149,6 +149,10 @@ public:
MachineBasicBlock::iterator &MI,
LiveVariables *LV) const override;
+ bool isSchedulingBoundary(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
static bool isSALU(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a804a5e6d32..c4a06ef2230 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -78,7 +78,7 @@ private:
void SkipIfDead(MachineInstr &MI);
void If(MachineInstr &MI);
- void Else(MachineInstr &MI);
+ void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
@@ -215,7 +215,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlow::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
@@ -225,6 +225,15 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
+ if (ExecModified) {
+ // Adjust the saved exec to account for the modifications during the flow
+ // block that contains the ELSE. This can happen when WQM mode is switched
+ // off.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .addReg(Dst);
+ }
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
@@ -488,7 +497,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
@@ -498,17 +506,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *EmptyMBBAtEnd = NULL;
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
+ bool ExecModified = false;
+
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI) || TII->isDS(MI))
- NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;
+ for (const auto &Def : I->defs()) {
+ if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
+ ExecModified = true;
+ break;
+ }
+ }
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
@@ -517,7 +532,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::SI_ELSE:
- Else(MI);
+ Else(MI, ExecModified);
break;
case AMDGPU::SI_BREAK:
@@ -599,12 +614,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
- MachineBasicBlock &MBB = MF.front();
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC).addReg(AMDGPU::EXEC);
- }
-
if (NeedFlat && MFI->IsKernel) {
// TODO: What to use with function calls?
// We will need to Initialize the flat scratch register pair.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a48d0932356..7c56f7e4473 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -72,9 +72,12 @@ public:
}
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ const TargetRegisterClass *RC;
if (TargetRegisterInfo::isVirtualRegister(Reg))
- return isSGPRClass(MRI.getRegClass(Reg));
- return getPhysRegClass(Reg);
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+ return isSGPRClass(RC);
}
/// \returns true if this class contains VGPR registers.
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 00000000000..31080e66435
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,465 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run on the
+/// scheduled machine IR but before register coalescing, so that machine SSA is
+/// available for analysis. It ensures that WQM is enabled when necessary, but
+/// disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+/// S_MOV_B64 LiveMask, EXEC
+/// S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// Basic blocks are always exited in WQM as long as some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+/// (1) at the top level (outside of control flow statements, and as long as
+/// kill hasn't been used), one SGPR can be saved by recovering WQM from
+/// the LiveMask (this is implemented for the entry block).
+///
+/// (2) when entire regions (e.g. if-else blocks or entire loops) only
+/// consist of exact and don't-care instructions, the switch only has to
+/// be done at the entry and exit points rather than potentially in each
+/// block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum {
+ StateWQM = 0x1,
+ StateExact = 0x2,
+};
+
+struct InstrInfo {
+ char Needs = 0;
+ char OutNeeds = 0;
+};
+
+struct BlockInfo {
+ char Needs = 0;
+ char InNeeds = 0;
+ char OutNeeds = 0;
+};
+
+struct WorkItem {
+ const MachineBasicBlock *MBB = nullptr;
+ const MachineInstr *MI = nullptr;
+
+ WorkItem() {}
+ WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {}
+ WorkItem(const MachineInstr *MI) : MI(MI) {}
+};
+
+class SIWholeQuadMode : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ DenseMap<const MachineInstr *, InstrInfo> Instructions;
+ DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
+ SmallVector<const MachineInstr *, 2> ExecExports;
+
+ char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+ void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
+ void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
+ char analyzeFunction(const MachineFunction &MF);
+
+ void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg);
+ void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedWQM);
+ void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+public:
+ static char ID;
+
+ SIWholeQuadMode() :
+ MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Whole Quad Mode";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE,
+ "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE,
+ "SI Whole Quad Mode", false, false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+ return new SIWholeQuadMode;
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements.
+char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+ std::vector<WorkItem> &Worklist) {
+ char GlobalFlags = 0;
+
+ for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+ const MachineBasicBlock &MBB = *BI;
+
+ for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+ const MachineInstr &MI = *II;
+ unsigned Opcode = MI.getOpcode();
+ char Flags;
+
+ if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ Flags = StateWQM;
+ } else if (TII->get(Opcode).mayStore() &&
+ (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
+ Flags = StateExact;
+ } else {
+ // Handle export instructions with the exec mask valid flag set
+ if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
+ ExecExports.push_back(&MI);
+ continue;
+ }
+
+ Instructions[&MI].Needs = Flags;
+ Worklist.push_back(&MI);
+ GlobalFlags |= Flags;
+ }
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI,
+ std::vector<WorkItem>& Worklist) {
+ const MachineBasicBlock &MBB = *MI.getParent();
+ InstrInfo &II = Instructions[&MI];
+ BlockInfo &BI = Blocks[&MBB];
+
+ // Control flow-type instructions that are followed by WQM computations
+ // must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
+ (MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL))
+ II.Needs = StateWQM;
+
+ // Propagate to block level
+ BI.Needs |= II.Needs;
+ if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
+ BI.InNeeds |= II.Needs;
+ Worklist.push_back(&MBB);
+ }
+
+ // Propagate backwards within block
+ if (const MachineInstr *PrevMI = MI.getPrevNode()) {
+ char InNeeds = II.Needs | II.OutNeeds;
+ if (!PrevMI->isPHI()) {
+ InstrInfo &PrevII = Instructions[PrevMI];
+ if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
+ PrevII.OutNeeds |= InNeeds;
+ Worklist.push_back(PrevMI);
+ }
+ }
+ }
+
+ // Propagate WQM flag to instruction inputs
+ assert(II.Needs != (StateWQM | StateExact));
+ if (II.Needs != StateWQM)
+ return;
+
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
+ continue;
+
+ // At this point, physical registers appear as inputs or outputs
+ // and following them makes no sense (and would in fact be incorrect
+ // when the same VGPR is used as both an output and an input that leads
+ // to a NeedsWQM instruction).
+ //
+ // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
+ // have to trace this, in practice it happens for 64-bit computations like
+ // pointers where both dwords are followed already anyway.
+ if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+ continue;
+
+ for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
+ const MachineInstr *DefMI = Def.getParent();
+ InstrInfo &DefII = Instructions[DefMI];
+
+ // Obviously skip if DefMI is already flagged as NeedWQM.
+ //
+ // The instruction might also be flagged as NeedExact. This happens when
+ // the result of an atomic is used in a WQM computation. In this case,
+ // the atomic must not run for helper pixels and the WQM result is
+ // undefined.
+ if (DefII.Needs != 0)
+ continue;
+
+ DefII.Needs = StateWQM;
+ Worklist.push_back(DefMI);
+ }
+ }
+}
+
+void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
+ std::vector<WorkItem>& Worklist) {
+ BlockInfo &BI = Blocks[&MBB];
+
+ // Propagate through instructions
+ if (!MBB.empty()) {
+ const MachineInstr *LastMI = &*MBB.rbegin();
+ InstrInfo &LastII = Instructions[LastMI];
+ if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
+ LastII.OutNeeds |= BI.OutNeeds;
+ Worklist.push_back(LastMI);
+ }
+ }
+
+ // Predecessor blocks must provide for our WQM/Exact needs.
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ BlockInfo &PredBI = Blocks[Pred];
+ if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
+ continue;
+
+ PredBI.OutNeeds |= BI.InNeeds;
+ PredBI.InNeeds |= BI.InNeeds;
+ Worklist.push_back(Pred);
+ }
+
+ // All successors must be prepared to accept the same set of WQM/Exact
+ // data.
+ for (const MachineBasicBlock *Succ : MBB.successors()) {
+ BlockInfo &SuccBI = Blocks[Succ];
+ if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
+ continue;
+
+ SuccBI.InNeeds |= BI.OutNeeds;
+ Worklist.push_back(Succ);
+ }
+}
+
+char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+ std::vector<WorkItem> Worklist;
+ char GlobalFlags = scanInstructions(MF, Worklist);
+
+ while (!Worklist.empty()) {
+ WorkItem WI = Worklist.back();
+ Worklist.pop_back();
+
+ if (WI.MI)
+ propagateInstruction(*WI.MI, Worklist);
+ else
+ propagateBlock(*WI.MBB, Worklist);
+ }
+
+ return GlobalFlags;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveWQM, unsigned LiveMaskReg)
+{
+ if (SaveWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ SaveWQM)
+ .addReg(LiveMaskReg);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(LiveMaskReg);
+ }
+}
+
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedWQM)
+{
+ if (SavedWQM) {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ .addReg(SavedWQM);
+ } else {
+ BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ }
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+ bool isEntry) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+
+ if (!(BI.InNeeds & StateWQM))
+ return;
+
+ // This is a non-entry block that is WQM throughout, so no need to do
+ // anything.
+ if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ return;
+
+ unsigned SavedWQMReg = 0;
+ bool WQMFromExec = isEntry;
+ char State = isEntry ? StateExact : StateWQM;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ MachineInstr &MI = *II;
+ ++II;
+
+ // Skip instructions that are not affected by EXEC
+ if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
+ !MI.isBranch() && !MI.isTerminator())
+ continue;
+
+ // Generic instructions such as COPY will either disappear by register
+ // coalescing or be lowered to SALU or VALU instructions.
+ if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
+ if (MI.getNumExplicitOperands() >= 1) {
+ const MachineOperand &Op = MI.getOperand(0);
+ if (Op.isReg()) {
+ if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+ // SGPR instructions are not affected by EXEC
+ continue;
+ }
+ }
+ }
+ }
+
+ char Needs = 0;
+ char OutNeeds = 0;
+ auto InstrInfoIt = Instructions.find(&MI);
+ if (InstrInfoIt != Instructions.end()) {
+ Needs = InstrInfoIt->second.Needs;
+ OutNeeds = InstrInfoIt->second.OutNeeds;
+
+ // Make sure to switch to Exact mode before the end of the block when
+ // Exact and only Exact is needed further downstream.
+ if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
+ assert(Needs == 0);
+ Needs = StateExact;
+ }
+ }
+
+ // State switching
+ if (Needs && State != Needs) {
+ if (Needs == StateExact) {
+ assert(!SavedWQMReg);
+
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+ } else {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, &MI, SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+
+ State = Needs;
+ }
+
+ if (MI.getOpcode() == AMDGPU::SI_KILL)
+ WQMFromExec = false;
+ }
+
+ if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
+ toWQM(MBB, MBB.end(), SavedWQMReg);
+ } else if (BI.OutNeeds == StateExact && State != StateExact) {
+ toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ }
+}
+
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (MFI->getShaderType() != ShaderType::PIXEL)
+ return false;
+
+ Instructions.clear();
+ Blocks.clear();
+ ExecExports.clear();
+
+ TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+ MRI = &MF.getRegInfo();
+
+ char GlobalFlags = analyzeFunction(MF);
+ if (!(GlobalFlags & StateWQM))
+ return false;
+
+ MachineBasicBlock &Entry = MF.front();
+ MachineInstr *EntryMI = Entry.getFirstNonPHI();
+
+ if (GlobalFlags == StateWQM) {
+ // For a shader that needs only WQM, we can just set it once.
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+ AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+ return true;
+ }
+
+ // Handle the general case
+ unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+
+ for (const auto &BII : Blocks)
+ processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
+ BII.first == &*MF.begin());
+
+ return true;
+}
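As a rough guide to experimenting with the new pass (the input file name below
is hypothetical, and the flags assume an LLVM build of this vintage with the
AMDGPU target enabled), the machine IR surrounding each pass can be dumped
with:

    llc -march=amdgcn -mcpu=verde -print-after-all shader.ll -o /dev/null 2> passes.log

The "SI Whole Quad Mode" section of the resulting log shows where the
S_WQM_B64 / S_AND_SAVEEXEC_B64 sequences described above were inserted.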