AMDGPU: More bits of frame index are known to be zero

The maximum private allocation for the whole GPU is 4G, so the maximum possible index for a single workitem is the maximum size divided by the smallest granularity for a dispatch. This increases the number of known zero high bits, which enables more offset folding. With this bound, the maximum private size per workitem is 128M, although in practice it may be smaller still.

llvm-svn: 262153
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-02-27 20:26:57 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-02-27 20:26:57 +0000
commit: 3a61985b2fe922ee7c94c0aa148fad046347829d (patch)
tree: 2c8a3a6d5e4450fb7dd03f0535f7d827a210cd4d /llvm/lib
parent: d6ebd07b8d1ba5fed46790e0246a2d0716c4b63b (diff)
download: bcm5719-llvm-3a61985b2fe922ee7c94c0aa148fad046347829d.tar.gz
bcm5719-llvm-3a61985b2fe922ee7c94c0aa148fad046347829d.zip
4 files changed, 26 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0975f97998a..af12ce7360b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -198,14 +198,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
 def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
 def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
 
-
-def FeatureEnableHugeScratchBuffer : SubtargetFeature<
-  "huge-scratch-buffer",
-  "EnableHugeScratchBuffer",
-  "true",
-  "Enable scratch buffer sizes greater than 128 GB"
->;
-
 def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
   "EnableVGPRSpilling",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fed46fe3715..96319151765 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -84,7 +84,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false),
       HasSMemRealTime(false), Has16BitInsts(false),
       LDSBankCount(0),
-      IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
+      IsaVersion(ISAVersion0_0_0),
       EnableSIScheduler(false), FrameLowering(nullptr),
       InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b9874f8a186..39228945d06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -93,7 +93,6 @@ private:
   bool FeatureDisable;
   int LDSBankCount;
   unsigned IsaVersion;
-  bool EnableHugeScratchBuffer;
   bool EnableSIScheduler;
 
   std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
@@ -293,10 +292,6 @@ public:
     return false;
   }
 
-  bool enableHugeScratchBuffer() const {
-    return EnableHugeScratchBuffer;
-  }
-
   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 46b73f7ed09..d6a69da7ba5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1178,25 +1178,35 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
   FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
   unsigned FrameIndex = FINode->getIndex();
 
-  // A FrameIndex node represents a 32-bit offset into scratch memory.  If
-  // the high bit of a frame index offset were to be set, this would mean
-  // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
-  // scratch buffer, with 64 being the number of threads per wave.
+  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+  // high bit of a frame index offset were to be set, this would mean that it
+  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+  // buffer, with 64 being the number of threads per wave.
   //
-  // If we know the machine uses less than 128GB of scratch, then we can
-  // amrk the high bit of the FrameIndex node as known zero,
-  // which is important, because it means in most situations we can
-  // prove that values derived from FrameIndex nodes are non-negative.
-  // This enables us to take advantage of more addressing modes when
-  // accessing scratch buffers, since for scratch reads/writes, the register
-  // offset must always be positive.
+  // The maximum private allocation for the entire GPU is 4G, and we are
+  // concerned with the largest the index could ever be for an individual
+  // workitem. This will occur with the minimum dispatch size. If a program
+  // requires more, the dispatch size will be reduced.
+  //
+  // With this limit, we can mark the high bits of the FrameIndex node as known
+  // zero, which is important, because it means in most situations we can prove
+  // that values derived from FrameIndex nodes are non-negative. This enables us
+  // to take advantage of more addressing modes when accessing scratch buffers,
+  // since for scratch reads/writes, the register offset must always be
+  // positive.
 
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  if (Subtarget->enableHugeScratchBuffer())
-    return TFI;
+  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+
+  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+  // granularity. It is probably a full wave.
+  uint64_t MinGranularity = 32;
 
+  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                    DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+                     DAG.getValueType(ExtVT));
 }
 
 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-02-27 20:26:57 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-02-27 20:26:57 +0000
commit	3a61985b2fe922ee7c94c0aa148fad046347829d (patch)
tree	2c8a3a6d5e4450fb7dd03f0535f7d827a210cd4d /llvm/lib
parent	d6ebd07b8d1ba5fed46790e0246a2d0716c4b63b (diff)
download	bcm5719-llvm-3a61985b2fe922ee7c94c0aa148fad046347829d.tar.gz bcm5719-llvm-3a61985b2fe922ee7c94c0aa148fad046347829d.zip