summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp91
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp54
-rw-r--r--llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir580
-rw-r--r--llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir49
7 files changed, 693 insertions, 97 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index f9b400cfe1b..63634f434fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -806,10 +806,14 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool hasReadM0Hazard() const {
+ bool hasReadM0MovRelInterpHazard() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ bool hasReadM0SendMsgHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; // VI and newer
+ }
+
unsigned getKernArgSegmentSize(const MachineFunction &MF,
unsigned ExplictArgBytes) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 42bd2023c8c..be0588b45e3 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -87,6 +87,18 @@ static bool isSMovRel(unsigned Opcode) {
}
}
+static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ case AMDGPU::S_TTRACEDATA:
+ return true;
+ default:
+ // TODO: Also return true for GDS instructions, as the function name promises.
+ return false;
+ }
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -100,7 +112,10 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return NoopHazard;
- if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+ // FIXME: Should flat be considered vmem?
+ if ((SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI))
+ && checkVMEMHazards(MI) > 0)
return NoopHazard;
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
@@ -124,7 +139,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return NoopHazard;
- if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+ if (ST.hasReadM0MovRelInterpHazard() &&
+ (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+ checkReadM0Hazards(MI) > 0)
+ return NoopHazard;
+
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
checkReadM0Hazards(MI) > 0)
return NoopHazard;
@@ -144,26 +164,20 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVALU(*MI)) {
- WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+ if (SIInstrInfo::isVALU(*MI))
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
- if (SIInstrInfo::isVMEM(*MI))
- WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
- if (SIInstrInfo::isDPP(*MI))
- WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+ if (SIInstrInfo::isDPP(*MI))
+ WaitStates = std::max(WaitStates, checkDPPHazards(MI));
- if (isDivFMas(MI->getOpcode()))
- WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+ if (isDivFMas(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
- if (isRWLane(MI->getOpcode()))
- WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
-
- if (TII.isVINTRP(*MI))
- WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
-
- return WaitStates;
- }
+ if (isRWLane(MI->getOpcode()))
+ WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
if (isSGetReg(MI->getOpcode()))
return std::max(WaitStates, checkGetRegHazards(MI));
@@ -174,7 +188,11 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (isRFE(MI->getOpcode()))
return std::max(WaitStates, checkRFEHazards(MI));
- if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+ if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
+ isSMovRel(MI->getOpcode())))
+ return std::max(WaitStates, checkReadM0Hazards(MI));
+
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
return std::max(WaitStates, checkReadM0Hazards(MI));
return WaitStates;
@@ -282,12 +300,14 @@ void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
addRegsToSet(TRI, MI.uses(), ClauseUses);
}
-int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// SMEM soft clause are only present on VI+, and only matter if xnack is
// enabled.
if (!ST.isXNACKEnabled())
return 0;
+ bool IsSMRD = TII.isSMRD(*MEM);
+
resetClause();
// A soft-clause is any group of consecutive SMEM instructions. The
@@ -303,7 +323,10 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
for (MachineInstr *MI : EmittedInstrs) {
// When we hit a non-SMEM instruction then we have passed the start of the
// clause and we can stop.
- if (!MI || !SIInstrInfo::isSMRD(*MI))
+ if (!MI)
+ break;
+
+ if (IsSMRD != SIInstrInfo::isSMRD(*MI))
break;
addClauseInst(*MI);
@@ -312,13 +335,13 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
if (ClauseDefs.none())
return 0;
- // FIXME: When we support stores, we need to make sure not to put loads and
- // stores in the same clause if they use the same address. For now, just
- // start a new clause whenever we see a store.
- if (SMEM->mayStore())
+ // We need to make sure not to put loads and stores in the same clause if they
+ // use the same address. For now, just start a new clause whenever we see a
+ // store.
+ if (MEM->mayStore())
return 1;
- addClauseInst(*SMEM);
+ addClauseInst(*MEM);
// If the set of defs and uses intersect then we cannot add this instruction
// to the clause, so we have a hazard.
@@ -329,7 +352,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
int WaitStatesNeeded = 0;
- WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+ WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
@@ -369,18 +392,15 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- const SIInstrInfo *TII = ST.getInstrInfo();
-
if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return 0;
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
// A read of an SGPR by a VMEM instruction requires 5 wait states when the
// SGPR was written by a VALU Instruction.
- int VmemSgprWaitStates = 5;
- int WaitStatesNeeded = 0;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ const int VmemSgprWaitStates = 5;
+ auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
for (const MachineOperand &Use : VMEM->uses()) {
if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
@@ -598,11 +618,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
- if (!ST.hasReadM0Hazard())
- return 0;
-
const SIInstrInfo *TII = ST.getInstrInfo();
- int SMovRelWaitStates = 1;
+ const int SMovRelWaitStates = 1;
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index eb382cc8c77..01682acfac4 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -58,7 +58,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
[](MachineInstr *) { return true; });
int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
- int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+ int checkSoftClauseHazards(MachineInstr *SMEM);
int checkSMRDHazards(MachineInstr *SMRD);
int checkVMEMHazards(MachineInstr* VMEM);
int checkDPPHazards(MachineInstr *DPP);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c41757d5825..2d41d8965b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1522,8 +1522,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->dump();
});
- bool InsertNOP = false;
-
// Walk over the instructions.
for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
Iter != E;) {
@@ -1624,58 +1622,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
VCCZBugHandledSet.insert(&Inst);
}
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-
- // This avoids a s_nop after a waitcnt has just been inserted.
- if (!SWaitInst && InsertNOP) {
- BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- }
- InsertNOP = false;
-
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- bool IsSMEM = false;
- bool IsVMEM = false;
- if (TII->isSMRD(Inst))
- IsSMEM = true;
- else if (TII->usesVM_CNT(Inst))
- IsVMEM = true;
-
- ++Iter;
- if (Iter == E)
- break;
-
- MachineInstr &Next = *Iter;
-
- // TODO: How about consecutive SMEM instructions?
- // The comments above says break the clause but the code does not.
- // if ((TII->isSMRD(next) && isSMEM) ||
- if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
- // TODO: Enable this check when hasSoftClause is upstreamed.
- // ST->hasSoftClauses() &&
- ST->isXNACKEnabled()) {
- // Insert a NOP to break the clause.
- InsertNOP = true;
- continue;
- }
-
- // There must be "S_NOP 0" between an instruction writing M0 and
- // S_SENDMSG.
- if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
- Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
- Inst.definesRegister(AMDGPU::M0))
- InsertNOP = true;
-
- continue;
- }
-
++Iter;
}
diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
new file mode 100644
index 00000000000..92145d319b1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
@@ -0,0 +1,580 @@
+# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x1
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_flat4_x1
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x2
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_flat4_x2
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x3
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_flat4_x3
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_clause_load_flat4_x4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_flat4_x4
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Reuse of same input pointer is OK
+
+name: trivial_clause_load_flat4_x2_sameptr
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: flat_load4_overwrite_ptr_lo
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: flat_load4_overwrite_ptr_lo
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: flat_load4_overwrite_ptr_hi
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: flat_load4_overwrite_ptr_hi
+ ; GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# 64-bit load clobbers its own ptr reg
+name: flat_load8_overwrite_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: flat_load8_overwrite_ptr
+ ; GCN: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt
+# breaks the clause.
+
+
+name: break_clause_at_max_clause_size_flat_load4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4
+ ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18
+ S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat4_lo_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat4_hi_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+name: break_clause_simple_load_flat8_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_simple_load_flat8_ptr
+ ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+
+name: break_clause_simple_load_flat16_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_simple_load_flat16_ptr
+ ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+ %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+# A waitcnt inserted at the end of the block would break the clause, but this
+# test only runs the hazard recognizer, which still emits a nop with xnack.
+
+
+name: break_clause_block_boundary_load_flat8_ptr
+
+body: |
+ ; GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN: bb.1:
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ bb.0:
+ %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ bb.1:
+ %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# The load clobbers the pointer of the store, but no nop is currently emitted.
+
+name: break_clause_store_load_into_ptr_flat4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_store_load_into_ptr_flat4
+ ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# The load clobbers the data of the store, so it needs to break.
+# FIXME: Would it be better to s_nop and wait later?
+
+name: break_clause_store_load_into_data_flat4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_store_load_into_data_flat4
+ ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Regular VALU instruction breaks clause, but a nop is still emitted with xnack
+
+name: valu_inst_breaks_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: valu_inst_breaks_clause
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Regular SALU instruction breaks clause, but a nop is still emitted with xnack
+
+name: salu_inst_breaks_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu_inst_breaks_clause
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %sgpr8 = S_MOV_B32 0
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %sgpr8 = S_MOV_B32 0
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+name: ds_inst_breaks_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: ds_inst_breaks_clause
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+name: smrd_inst_breaks_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: smrd_inst_breaks_clause
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# FIXME: Should this be handled?
+name: implicit_use_breaks_clause
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: implicit_use_breaks_clause
+ ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5
+ %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+name: trivial_clause_load_mubuf4_x2
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2
+ ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+name: break_clause_simple_load_mubuf_offen_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr
+ ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+# BUFFER instructions overwriting their own inputs is supposedly OK.
+
+name: mubuf_load4_overwrite_ptr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mubuf_load4_overwrite_ptr
+ ; GCN: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: %vgpr1 = V_MOV_B32_e32 0, implicit %exec
+ ; GCN-NEXT: %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr1 = V_MOV_B32_e32 0, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec
+ S_ENDPGM
+...
+---
+# Break a clause from interference between mubuf and flat instructions
+
+name: break_clause_flat_load_mubuf_load
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_flat_load_mubuf_load
+ ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+# Break a clause from interference between mubuf and flat instructions
+
+# GCN-LABEL: name: break_clause_mubuf_load_flat_load
+# GCN: bb.0:
+# GCN-NEXT: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4
+# XNACK-NEXT: S_NOP 0
+# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3
+# GCN-NEXT: S_ENDPGM
+name: break_clause_mubuf_load_flat_load
+
+body: |
+ bb.0:
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ S_ENDPGM
+...
+---
+
+name: break_clause_atomic_rtn_into_ptr_flat4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4
+ ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+name: break_clause_atomic_nortn_ptr_load_flat4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4
+ ; GCN: FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: S_ENDPGM
+
+ FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+
+name: break_clause_atomic_rtn_into_ptr_mubuf4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4
+ ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+
+name: break_clause_atomic_nortn_ptr_load_mubuf4
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4
+ ; GCN: BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+ ; GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+
+ BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+# Make sure there is no assert on mubuf instructions which do not have
+# vaddr, and don't add register to track.
+name: no_break_clause_mubuf_load_novaddr
+
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr
+ ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ ; GCN-NEXT: S_ENDPGM
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+# Loads and stores using different addresses theoretically do not
+# need a nop, but one is currently emitted before the second store with xnack.
+name: mix_load_store_clause
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mix_load_store_clause
+ ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# Loads and stores using the same address need a nop.
+
+name: mix_load_store_clause_same_address
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mix_load_store_clause_same_address
+ ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; XNACK-NEXT: S_NOP 0
+ ; GCN-NEXT: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 15006e5fdca..16d9070849b 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -437,22 +437,22 @@ body: |
# GCN-LABEL: bb.0:
# GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
# GCN-NEXT: V_INTERP_P1_F32
# GCN-LABEL: bb.1:
# GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
# GCN-NEXT: V_INTERP_P2_F32
# GCN-LABEL: bb.2:
# GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
# GCN-NEXT: V_INTERP_P1_F32_16bank
# GCN-LABEL: bb.3:
# GCN: S_MOV_B32
-# GFX9: S_NOP
+# GFX9-NEXT: S_NOP
# GCN-NEXT: V_INTERP_MOV_F32
name: v_interp
diff --git a/llvm/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir b/llvm/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir
new file mode 100644
index 00000000000..5dfd5aa384f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir
@@ -0,0 +1,49 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,VI %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,CI %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,SI %s
+
+---
+name: m0_sendmsg
+body: |
+ ; GCN-LABEL: name: m0_sendmsg
+ ; GCN: %m0 = S_MOV_B32 -1
+ ; VI-NEXT: S_NOP 0
+ ; GFX9-NEXT: S_NOP 0
+ ; GCN-NEXT: S_SENDMSG 3, implicit %exec, implicit %m0
+
+ bb.0:
+ %m0 = S_MOV_B32 -1
+ S_SENDMSG 3, implicit %exec, implicit %m0
+ S_ENDPGM
+...
+---
+
+name: m0_sendmsghalt
+body: |
+ ; GCN-LABEL: name: m0_sendmsghalt
+ ; GCN: %m0 = S_MOV_B32 -1
+ ; VI-NEXT: S_NOP 0
+ ; GFX9-NEXT: S_NOP 0
+ ; GCN-NEXT: S_SENDMSGHALT 3, implicit %exec, implicit %m0
+
+ bb.0:
+ %m0 = S_MOV_B32 -1
+ S_SENDMSGHALT 3, implicit %exec, implicit %m0
+ S_ENDPGM
+...
+---
+
+name: m0_ttracedata
+body: |
+ ; GCN-LABEL: name: m0_ttracedata
+ ; GCN: %m0 = S_MOV_B32 -1
+ ; VI-NEXT: S_NOP 0
+ ; GFX9-NEXT: S_NOP 0
+ ; GCN-NEXT: S_TTRACEDATA implicit %m0
+
+ bb.0:
+ %m0 = S_MOV_B32 -1
+ S_TTRACEDATA implicit %m0
+ S_ENDPGM
+...
OpenPOWER on IntegriCloud