diff options
| author | alex-t <alexander.timofeev@amd.com> | 2020-01-04 18:23:14 +0300 | 
|---|---|---|
| committer | alex-t <alexander.timofeev@amd.com> | 2020-01-04 18:23:14 +0300 | 
| commit | ca8b20ca3ba10288b61a083c4ce57fb011124935 (patch) | |
| tree | 60baa7eb6c2d7dd593da13c3b4749e3806bb9b29 | |
| parent | 6d05bc2e3a9b54fde53aa5cbd83cc7c1d432cac1 (diff) | |
| download | bcm5719-llvm-ca8b20ca3ba10288b61a083c4ce57fb011124935.tar.gz bcm5719-llvm-ca8b20ca3ba10288b61a083c4ce57fb011124935.zip | |
[AMDGPU] Need to insert a wait between a scalar load and a vector store to the same address to avoid a WAR (write-after-read) conflict.
Reviewers: rampitec, vpykhtin, nhaehnle
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D71934
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 21 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll | 29 | 
2 files changed, 50 insertions, 0 deletions
| diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 927826c5240..ef662d55cb0 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -42,7 +42,9 @@  #include "llvm/CodeGen/MachineInstrBuilder.h"  #include "llvm/CodeGen/MachineMemOperand.h"  #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/InitializePasses.h"  #include "llvm/IR/DebugLoc.h"  #include "llvm/Pass.h"  #include "llvm/Support/Debug.h" @@ -372,6 +374,8 @@ private:    AMDGPU::IsaVersion IV;    DenseSet<MachineInstr *> TrackedWaitcntSet; +  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; +  MachinePostDominatorTree *PDT;    struct BlockInfo {      MachineBasicBlock *MBB; @@ -406,6 +410,7 @@ public:    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.setPreservesCFG(); +    AU.addRequired<MachinePostDominatorTree>();      MachineFunctionPass::getAnalysisUsage(AU);    } @@ -792,6 +797,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {  INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,                        false) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)  INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,                      false) @@ -1012,6 +1018,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(        if (MI.mayStore()) {          // FIXME: Should not be relying on memoperands.          
for (const MachineMemOperand *Memop : MI.memoperands()) { +          const Value *Ptr = Memop->getValue(); +          if (SLoadAddresses.count(Ptr)) { +            addWait(Wait, LGKM_CNT, 0); +            if (PDT->dominates(MI.getParent(), +                               SLoadAddresses.find(Ptr)->second)) +              SLoadAddresses.erase(Ptr); +          }            unsigned AS = Memop->getAddrSpace();            if (AS != AMDGPUAS::LOCAL_ADDRESS)              continue; @@ -1399,6 +1412,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,        }      } +    if (TII->isSMRD(Inst)) { +      for (const MachineMemOperand *Memop : Inst.memoperands()) { +        const Value *Ptr = Memop->getValue(); +        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); +      } +    } +      // Generate an s_waitcnt instruction to be placed before      // cur_Inst, if needed.      Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); @@ -1448,6 +1468,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {    MRI = &MF.getRegInfo();    IV = AMDGPU::getIsaVersion(ST->getCPU());    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); +  PDT = &getAnalysis<MachinePostDominatorTree>();    ForceEmitZeroWaitcnts = ForceEmitZeroFlag;    for (auto T : inst_counter_types()) diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll new file mode 100644 index 00000000000..4ba16b4eb30 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll @@ -0,0 +1,29 @@ +; RUN: llc  -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN + +; GCN-LABEL: BB0_1 +; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off + +define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 
addrspace(1)* nocapture %arg1) { +bb: +  %tmp = call i32 @llvm.amdgcn.workitem.id.x() +  %tmp2 = icmp eq i32 %tmp, 0 +  br i1 %tmp2, label %bb3, label %bb8 + +bb3:                                              ; preds = %bb +  %tmp4 = load i32, i32 addrspace(1)* %arg, align 4 +  store i32 0, i32 addrspace(1)* %arg, align 4 +  %tmp5 = zext i32 %tmp4 to i64 +  %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8 +  %tmp7 = add i64 %tmp6, %tmp5 +  store i64 %tmp7, i64 addrspace(1)* %arg1, align 8 +  br label %bb8 + +bb8:                                              ; preds = %bb3, %bb +  ret void +} +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable } | 

