summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 21
-rw-r--r-- llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll 29
2 files changed, 50 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 927826c5240..ef662d55cb0 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -372,6 +374,8 @@ private:
AMDGPU::IsaVersion IV;
DenseSet<MachineInstr *> TrackedWaitcntSet;
+ DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+ MachinePostDominatorTree *PDT;
struct BlockInfo {
MachineBasicBlock *MBB;
@@ -406,6 +410,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -792,6 +797,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
@@ -1012,6 +1018,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.mayStore()) {
// FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (SLoadAddresses.count(Ptr)) {
+ addWait(Wait, LGKM_CNT, 0);
+ if (PDT->dominates(MI.getParent(),
+ SLoadAddresses.find(Ptr)->second))
+ SLoadAddresses.erase(Ptr);
+ }
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -1399,6 +1412,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
}
+ if (TII->isSMRD(Inst)) {
+ for (const MachineMemOperand *Memop : Inst.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ }
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1468,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
for (auto T : inst_counter_types())
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
new file mode 100644
index 00000000000..4ba16b4eb30
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+; GCN-LABEL: BB0_1
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) { ; test kernel: bb3 loads from and then stores to each of %arg and %arg1
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = icmp eq i32 %tmp, 0
+ br i1 %tmp2, label %bb3, label %bb8 ; bb3 is entered only when %tmp is 0; otherwise go straight to the return
+
+bb3: ; preds = %bb
+ %tmp4 = load i32, i32 addrspace(1)* %arg, align 4 ; read of %arg ...
+ store i32 0, i32 addrspace(1)* %arg, align 4 ; ... followed by a write to the same address; presumably the load/store pair the s_load_dword / s_waitcnt lgkmcnt(0) / global_store_dword checks target - confirm
+ %tmp5 = zext i32 %tmp4 to i64
+ %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+ %tmp7 = add i64 %tmp6, %tmp5 ; %tmp7 = value loaded from %arg1 + zero-extended value loaded from %arg
+ store i64 %tmp7, i64 addrspace(1)* %arg1, align 8 ; second read-then-write pair, this time on %arg1
+ br label %bb8
+
+bb8: ; preds = %bb3, %bb
+ ret void
+}
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
OpenPOWER on IntegriCloud