[AMDGPU] Fix DPP combiner check for exec modification

Summary: r363675 changed the exec modification helper function, now called execMayBeModifiedBeforeUse, so that if no UseMI is specified it checks all instructions in the basic block, even beyond the last use. That meant that the DPP combiner no longer worked in any basic block that ended with a control flow instruction, and in particular it didn't work on code sequences generated by the atomic optimizer. Fix it by reinstating the old behaviour but in a new helper function execMayBeModifiedBeforeAnyUse, and limiting the number of instructions scanned. Reviewers: arsenm, vpykhtin Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64393 llvm-svn: 365910
author: Jay Foad <jay.foad@gmail.com> 2019-07-12 15:59:40 +0000
committer: Jay Foad <jay.foad@gmail.com> 2019-07-12 15:59:40 +0000
commit: 27ec195f391cf1bcf39fe25f345bc63061566c3c (patch)
tree: e0f66473063bdf2ced10affcf6292cc96cc49b34
parent: f625a8a250b393b29a277ca790f6bc2fce7aa192 (diff)
download: bcm5719-llvm-27ec195f391cf1bcf39fe25f345bc63061566c3c.tar.gz
bcm5719-llvm-27ec195f391cf1bcf39fe25f345bc63061566c3c.zip
10 files changed, 115 insertions, 31 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 7348b5b56c8..e1845e2e8e8 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -344,7 +344,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
   assert(DstOpnd && DstOpnd->isReg());
   auto DPPMovReg = DstOpnd->getReg();
-  if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) {
+  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
     LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                          " for all uses\n");
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index bcc3478c67b..74d77d32801 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -650,7 +650,7 @@ void SIFoldOperands::foldOperand(
         if (execMayBeModifiedBeforeUse(*MRI,
                                        UseMI->getOperand(UseOpIdx).getReg(),
                                        *OpToFold.getParent(),
-                                       UseMI))
+                                       *UseMI))
           return;
 
         UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -669,7 +669,7 @@ void SIFoldOperands::foldOperand(
         if (execMayBeModifiedBeforeUse(*MRI,
                                        UseMI->getOperand(UseOpIdx).getReg(),
                                        *OpToFold.getParent(),
-                                       UseMI))
+                                       *UseMI))
           return;
 
         // %vgpr = COPY %sgpr0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d855f3f0e42..34741850f82 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6269,47 +6269,75 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
 }
 
 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                      unsigned VReg,
+                                      Register VReg,
                                       const MachineInstr &DefMI,
-                                      const MachineInstr *UseMI) {
+                                      const MachineInstr &UseMI) {
   assert(MRI.isSSA() && "Must be run on SSA");
 
   auto *TRI = MRI.getTargetRegisterInfo();
   auto *DefBB = DefMI.getParent();
 
-  if (UseMI) {
+  // Don't bother searching between blocks, although it is possible this block
+  // doesn't modify exec.
+  if (UseMI.getParent() != DefBB)
+    return true;
+
+  const int MaxInstScan = 20;
+  int NumInst = 0;
+
+  // Stop scan at the use.
+  auto E = UseMI.getIterator();
+  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+    if (I->isDebugInstr())
+      continue;
+
+    if (++NumInst > MaxInstScan)
+      return true;
+
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+      return true;
+  }
+
+  return false;
+}
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                         Register VReg,
+                                         const MachineInstr &DefMI) {
+  assert(MRI.isSSA() && "Must be run on SSA");
+
+  auto *TRI = MRI.getTargetRegisterInfo();
+  auto *DefBB = DefMI.getParent();
+
+  const int MaxUseInstScan = 10;
+  int NumUseInst = 0;
+
+  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
     // Don't bother searching between blocks, although it is possible this block
     // doesn't modify exec.
-    if (UseMI->getParent() != DefBB)
+    if (UseInst.getParent() != DefBB)
       return true;
-  } else {
-    int NumUse = 0;
-    const int MaxUseScan = 10;
-
-    for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
-      if (UseInst.getParent() != DefBB)
-        return true;
 
-      if (NumUse++ > MaxUseScan)
-        return true;
-    }
+    if (++NumUseInst > MaxUseInstScan)
+      return true;
   }
 
   const int MaxInstScan = 20;
-  int NumScan = 0;
+  int NumInst = 0;
 
-  // Stop scan at the use if known.
-  auto E = UseMI ? UseMI->getIterator() : DefBB->end();
-  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+  // Stop scan when we have seen all the uses.
+  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
     if (I->isDebugInstr())
       continue;
 
-    if (NumScan++ > MaxInstScan)
+    if (++NumInst > MaxInstScan)
       return true;
 
+    if (I->readsRegister(VReg))
+      if (--NumUseInst == 0)
+        return false;
+
     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
       return true;
   }
-
-  return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b8275b62272..1f3c659f9d9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1016,13 +1016,19 @@ MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
-/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p
-/// VReg. Should be run on SSA. Currently does not attempt to track between
-/// blocks.
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
 bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
-                                unsigned VReg,
+                                Register VReg,
                                 const MachineInstr &DefMI,
-                                const MachineInstr *UseMI = nullptr);
+                                const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+                                   Register VReg,
+                                   const MachineInstr &DefMI);
 
 namespace AMDGPU {
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 969eb3cfa7a..6f52a5f06a3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -42,6 +42,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -133,6 +135,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 3ce91e83cf3..f3d50c9c490 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -45,6 +45,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -136,6 +138,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index c1ce93242ee..a170f96e4fe 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -39,6 +39,8 @@ else:
 ; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index c2db5547201..dd813df3281 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -44,6 +44,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -104,6 +106,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index eb3f0ab17ac..c32a73ef2b8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -44,6 +44,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -117,6 +119,8 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
index d98cde5cff4..c43fb037d07 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -380,6 +380,38 @@ body: |
     %9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
 ...
 
+# tests on sequences of dpp consumers followed by control flow
+# CHECK-LABEL: name: dpp_seq_cf
+# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+name: dpp_seq_cf
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+    %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
+    %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
+
+    %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
+    %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+...
+
 # old reg def is in diff BB - cannot combine
 # CHECK-LABEL: name: old_in_diff_bb
 # CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
author	Jay Foad <jay.foad@gmail.com>	2019-07-12 15:59:40 +0000
committer	Jay Foad <jay.foad@gmail.com>	2019-07-12 15:59:40 +0000
commit	27ec195f391cf1bcf39fe25f345bc63061566c3c (patch)
tree	e0f66473063bdf2ced10affcf6292cc96cc49b34
parent	f625a8a250b393b29a277ca790f6bc2fce7aa192 (diff)
download	bcm5719-llvm-27ec195f391cf1bcf39fe25f345bc63061566c3c.tar.gz bcm5719-llvm-27ec195f391cf1bcf39fe25f345bc63061566c3c.zip