diff options
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPU.td | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUSubtarget.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUSubtarget.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/Processors.td | 14 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp | 43 | ||||
| -rw-r--r-- | llvm/test/CodeGen/R600/cf-stack-bug.ll | 225 | 
6 files changed, 288 insertions, 7 deletions
| diff --git a/llvm/lib/Target/R600/AMDGPU.td b/llvm/lib/Target/R600/AMDGPU.td index c4e5efc8d6e..d1e2cf5319c 100644 --- a/llvm/lib/Target/R600/AMDGPU.td +++ b/llvm/lib/Target/R600/AMDGPU.td @@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA",          "true",          "Use Cayman ISA">; +def FeatureCFALUBug : SubtargetFeature<"cfalubug", +        "CFALUBug", +        "true", +        "GPU has CF_ALU bug">; +  class SubtargetFeatureFetchLimit <string Value> :                            SubtargetFeature <"fetch"#Value,          "TexVTXClauseSize", diff --git a/llvm/lib/Target/R600/AMDGPUSubtarget.cpp b/llvm/lib/Target/R600/AMDGPUSubtarget.cpp index f36aa2071c7..e77ab5e6d14 100644 --- a/llvm/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/R600/AMDGPUSubtarget.cpp @@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :    EnableIRStructurizer = true;    EnableIfCvt = true;    WavefrontSize = 0; +  CFALUBug = false;    ParseSubtargetFeatures(GPU, FS);    DevName = GPU;  } @@ -97,6 +98,11 @@ AMDGPUSubtarget::getStackEntrySize() const {    }  }  bool +AMDGPUSubtarget::hasCFAluBug() const { +  assert(getGeneration() <= NORTHERN_ISLANDS); +  return CFALUBug; +} +bool  AMDGPUSubtarget::isTargetELF() const {    return false;  } diff --git a/llvm/lib/Target/R600/AMDGPUSubtarget.h b/llvm/lib/Target/R600/AMDGPUSubtarget.h index 68d853218ba..7e7f4d0c004 100644 --- a/llvm/lib/Target/R600/AMDGPUSubtarget.h +++ b/llvm/lib/Target/R600/AMDGPUSubtarget.h @@ -52,6 +52,7 @@ private:    bool EnableIRStructurizer;    bool EnableIfCvt;    unsigned WavefrontSize; +  bool CFALUBug;    InstrItineraryData InstrItins; @@ -71,6 +72,7 @@ public:    bool isIfCvtEnabled() const;    unsigned getWavefrontSize() const;    unsigned getStackEntrySize() const; +  bool hasCFAluBug() const;    virtual bool enableMachineScheduler() const {      return getGeneration() <= NORTHERN_ISLANDS; diff --git a/llvm/lib/Target/R600/Processors.td b/llvm/lib/Target/R600/Processors.td index e601f353163..fde44814970 100644 --- a/llvm/lib/Target/R600/Processors.td +++ b/llvm/lib/Target/R600/Processors.td @@ -46,13 +46,15 @@ def : Proc<"rv770",      R600_VLIW5_Itin,  //===----------------------------------------------------------------------===//  def : Proc<"cedar",      R600_VLIW5_Itin, -    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>; +    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, +     FeatureCFALUBug]>;  def : Proc<"redwood",    R600_VLIW5_Itin, -    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; +    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, +     FeatureCFALUBug]>;  def : Proc<"sumo",       R600_VLIW5_Itin, -    [FeatureEvergreen, FeatureWavefrontSize64]>; +    [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;  def : Proc<"juniper",    R600_VLIW5_Itin,      [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; @@ -66,13 +68,13 @@ def : Proc<"cypress",    R600_VLIW5_Itin,  //===----------------------------------------------------------------------===//  def : Proc<"barts",      R600_VLIW5_Itin, -    [FeatureNorthernIslands, FeatureVertexCache]>; +    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;  def : Proc<"turks",      R600_VLIW5_Itin, -    [FeatureNorthernIslands, FeatureVertexCache]>; +    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;  def : Proc<"caicos",     R600_VLIW5_Itin, -    [FeatureNorthernIslands]>; +    [FeatureNorthernIslands, FeatureCFALUBug]>;  def : Proc<"cayman",     R600_VLIW4_Itin,      [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; diff --git a/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp index 6b42a7a9faf..470ff2e1079 100644 --- a/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -73,6 +73,44 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {    return false;  } +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { +  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && +      getLoopDepth() > 1) +    return true; + +  if (!ST.hasCFAluBug()) +    return false; + +  switch(Opcode) { +  default: return false; +  case AMDGPU::CF_ALU_PUSH_BEFORE: +  case AMDGPU::CF_ALU_ELSE_AFTER: +  case AMDGPU::CF_ALU_BREAK: +  case AMDGPU::CF_ALU_CONTINUE: +    if (CurrentSubEntries == 0) +      return false; +    if (ST.getWavefrontSize() == 64) { +      // We are being conservative here.  We only require this work-around if +      // CurrentSubEntries > 3 && +      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) +      // +      // We have to be conservative, because we don't know for certain that +      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this +      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack +      // resources without any problems. +      return CurrentSubEntries > 3; +    } else { +      assert(ST.getWavefrontSize() == 32); +      // We are being conservative here.  We only require the work-around if +      // CurrentSubEntries > 7 && +      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) +      // See the comment on the wavefront size == 64 case for why we are +      // being conservative. +      return CurrentSubEntries > 7; +    } +  } +} +  unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {    switch(Item) {    default: @@ -472,9 +510,12 @@ public:          if (MI->getOpcode() == AMDGPU::CF_ALU)            LastAlu.back() = MI;          I++; +        bool RequiresWorkAround = +            CFStack.requiresWorkAroundForInst(MI->getOpcode());          switch (MI->getOpcode()) {          case AMDGPU::CF_ALU_PUSH_BEFORE: -          if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) { +          if (RequiresWorkAround) { +            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");              BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))                  .addImm(CfCount + 1)                  .addImm(1); diff --git a/llvm/test/CodeGen/R600/cf-stack-bug.ll b/llvm/test/CodeGen/R600/cf-stack-bug.ll new file mode 100644 index 00000000000..7fa07b11eea --- /dev/null +++ b/llvm/test/CodeGen/R600/cf-stack-bug.ll @@ -0,0 +1,225 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions if we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested3 +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: +  %0 = icmp sgt i32 %cond, 0 +  br i1 %0, label %if.1, label %end + +if.1: +  %1 = icmp sgt i32 %cond, 10 +  br i1 %1, label %if.2, label %if.store.1 + +if.store.1: +  store i32 1, i32 addrspace(1)* %out +  br label %end + +if.2: +  %2 = icmp sgt i32 %cond, 20 +  br i1 %2, label %if.3, label %if.2.store + +if.2.store: +  store i32 2, i32 addrspace(1)* %out +  br label %end + +if.3: +  store i32 3, i32 addrspace(1)* %out +  br label %end + +end: +  ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested4 +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: +  %0 = icmp sgt i32 %cond, 0 +  br i1 %0, label %if.1, label %end + +if.1: +  %1 = icmp sgt i32 %cond, 10 +  br i1 %1, label %if.2, label %if.1.store + +if.1.store: +  store i32 1, i32 addrspace(1)* %out +  br label %end + +if.2: +  %2 = icmp sgt i32 %cond, 20 +  br i1 %2, label %if.3, label %if.2.store + +if.2.store: +  store i32 2, i32 addrspace(1)* %out +  br label %end + +if.3: +  %3 = icmp sgt i32 %cond, 30 +  br i1 %3, label %if.4, label %if.3.store + +if.3.store: +  store i32 3, i32 addrspace(1)* %out +  br label %end + +if.4: +  store i32 4, i32 addrspace(1)* %out +  br label %end + +end: +  ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested7 +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: +  %0 = icmp sgt i32 %cond, 0 +  br i1 %0, label %if.1, label %end + +if.1: +  %1 = icmp sgt i32 %cond, 10 +  br i1 %1, label %if.2, label %if.1.store + +if.1.store: +  store i32 1, i32 addrspace(1)* %out +  br label %end + +if.2: +  %2 = icmp sgt i32 %cond, 20 +  br i1 %2, label %if.3, label %if.2.store + +if.2.store: +  store i32 2, i32 addrspace(1)* %out +  br label %end + +if.3: +  %3 = icmp sgt i32 %cond, 30 +  br i1 %3, label %if.4, label %if.3.store + +if.3.store: +  store i32 3, i32 addrspace(1)* %out +  br label %end + +if.4: +  %4 = icmp sgt i32 %cond, 40 +  br i1 %4, label %if.5, label %if.4.store + +if.4.store: +  store i32 4, i32 addrspace(1)* %out +  br label %end + +if.5: +  %5 = icmp sgt i32 %cond, 50 +  br i1 %5, label %if.6, label %if.5.store + +if.5.store: +  store i32 5, i32 addrspace(1)* %out +  br label %end + +if.6: +  %6 = icmp sgt i32 %cond, 60 +  br i1 %6, label %if.7, label %if.6.store + +if.6.store: +  store i32 6, i32 addrspace(1)* %out +  br label %end + +if.7: +  store i32 7, i32 addrspace(1)* %out +  br label %end + +end: +  ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested8 +define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: +  %0 = icmp sgt i32 %cond, 0 +  br i1 %0, label %if.1, label %end + +if.1: +  %1 = icmp sgt i32 %cond, 10 +  br i1 %1, label %if.2, label %if.1.store + +if.1.store: +  store i32 1, i32 addrspace(1)* %out +  br label %end + +if.2: +  %2 = icmp sgt i32 %cond, 20 +  br i1 %2, label %if.3, label %if.2.store + +if.2.store: +  store i32 2, i32 addrspace(1)* %out +  br label %end + +if.3: +  %3 = icmp sgt i32 %cond, 30 +  br i1 %3, label %if.4, label %if.3.store + +if.3.store: +  store i32 3, i32 addrspace(1)* %out +  br label %end + +if.4: +  %4 = icmp sgt i32 %cond, 40 +  br i1 %4, label %if.5, label %if.4.store + +if.4.store: +  store i32 4, i32 addrspace(1)* %out +  br label %end + +if.5: +  %5 = icmp sgt i32 %cond, 50 +  br i1 %5, label %if.6, label %if.5.store + +if.5.store: +  store i32 5, i32 addrspace(1)* %out +  br label %end + +if.6: +  %6 = icmp sgt i32 %cond, 60 +  br i1 %6, label %if.7, label %if.6.store + +if.6.store: +  store i32 6, i32 addrspace(1)* %out +  br label %end + +if.7: +  %7 = icmp sgt i32 %cond, 70 +  br i1 %7, label %if.8, label %if.7.store + +if.7.store: +  store i32 7, i32 addrspace(1)* %out +  br label %end + +if.8: +  store i32 8, i32 addrspace(1)* %out +  br label %end + +end: +  ret void +} | 

