AMDGPU: Fix broken FrameIndex handling

We were trying to avoid using a FrameIndex operand in non-pointer operands in a convoluted way, and would break because of using TargetFrameIndex. The TargetFrameIndex should only be used in the case where it makes sense to fold it as part of the addressing mode, otherwise it requires materialization like a normal constant. This wasn't working reliably and failed in the added testcase, hitting the assert when processing the frame index. The TargetFrameIndex was coming from trying to produce an AssertZext limiting the maximum stack size. I'm not sure this was correct to begin with, because it is apparently possible to have a single workitem dispatch that requires all 4G of private memory. llvm-svn: 281824
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-09-17 16:09:55 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-09-17 16:09:55 +0000
commit: ac0fc849cf53b41735526eba7931ee4f3508fb8f (patch)
tree: f4cf0befd7d3ba8e6255338d85188df82606185d /llvm/lib/Target
parent: bcfd94c2982e6b8468596390234832653a56fb54 (diff)
download: bcm5719-llvm-ac0fc849cf53b41735526eba7931ee4f3508fb8f.tar.gz
bcm5719-llvm-ac0fc849cf53b41735526eba7931ee4f3508fb8f.zip
5 files changed, 19 insertions, 99 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 951db65efbb..bf3e1da5156 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -51,10 +51,10 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
   void Select(SDNode *N) override;
   const char *getPassName() const override;
-  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 
 private:
+  SDValue foldFrameIndex(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                    const R600InstrInfo *TII);
@@ -902,6 +902,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
 }
 
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  return N;
+}
+
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
                                             SDValue &VAddr, SDValue &SOffset,
                                             SDValue &ImmOffset) const {
@@ -921,14 +927,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
     // Offsets in vaddr must be positive.
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
     if (isLegalMUBUFImmOffset(C1)) {
-      VAddr = N0;
+      VAddr = foldFrameIndex(N0);
       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
     }
   }
 
   // (node)
-  VAddr = Addr;
+  VAddr = foldFrameIndex(Addr);
   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
   return true;
 }
@@ -1516,62 +1522,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
-  // Handle the perverse case where a frame index is being stored. We don't
-  // want to see multiple frame index operands on the same instruction since
-  // it complicates things and violates some assumptions about frame index
-  // lowering.
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
-    // It's possible that we have a frame index defined in the function that
-    // isn't used in this block.
-    if (FI.use_empty())
-      continue;
-
-    // Skip over the AssertZext inserted during lowering.
-    SDValue EffectiveFI = FI;
-    auto It = FI->use_begin();
-    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
-      EffectiveFI = SDValue(*It, 0);
-      It = EffectiveFI->use_begin();
-    }
-
-    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
-      SDUse &Use = It.getUse();
-      SDNode *User = Use.getUser();
-      unsigned OpIdx = It.getOperandNo();
-      ++It;
-
-      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
-        unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
-        if (OpIdx == PtrIdx)
-          continue;
-
-        unsigned OpN = M->getNumOperands();
-        SDValue NewOps[8];
-
-        assert(OpN < array_lengthof(NewOps));
-        for (unsigned Op = 0; Op != OpN; ++Op) {
-          if (Op != OpIdx) {
-            NewOps[Op] = M->getOperand(Op);
-            continue;
-          }
-
-          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                      SDLoc(M), MVT::i32, FI);
-          NewOps[Op] = SDValue(Mov, 0);
-        }
-
-        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
-      }
-    }
-  }
-}
-
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4570fe585d1..f8d4e6131b0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -89,7 +89,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -1558,7 +1557,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     SDValue Result = LowerLOAD(Op, DAG);
@@ -1605,43 +1603,6 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
   return nullptr;
 }
 
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  SDLoc SL(Op);
-  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
-  unsigned FrameIndex = FINode->getIndex();
-
-  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
-  // high bit of a frame index offset were to be set, this would mean that it
-  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
-  // buffer, with 64 being the number of threads per wave.
-  //
-  // The maximum private allocation for the entire GPU is 4G, and we are
-  // concerned with the largest the index could ever be for an individual
-  // workitem. This will occur with the minmum dispatch size. If a program
-  // requires more, the dispatch size will be reduced.
-  //
-  // With this limit, we can mark the high bit of the FrameIndex node as known
-  // zero, which is important, because it means in most situations we can prove
-  // that values derived from FrameIndex nodes are non-negative. This enables us
-  // to take advantage of more addressing modes when accessing scratch buffers,
-  // since for scratch reads/writes, the register offset must always be
-  // positive.
-
-  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
-  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
-  // granularity. It is probably a full wave.
-  uint64_t MinGranularity = 32;
-
-  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
-  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                     DAG.getValueType(ExtVT));
-}
-
 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 06e7da63a8f..b65f95f7854 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -33,7 +33,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 37f8f17bff3..d04ff6a86ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -278,6 +278,11 @@ return CurDAG->getTargetConstant(
   N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
 }]>;
 
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7daa1032f05..86c3fd6815e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1582,6 +1582,11 @@ def : Pat <
 >;
 
 def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : Pat <
   (i64 InlineImm<i64>:$imm),
   (S_MOV_B64 InlineImm<i64>:$imm)
 >;
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-09-17 16:09:55 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-09-17 16:09:55 +0000
commit	ac0fc849cf53b41735526eba7931ee4f3508fb8f (patch)
tree	f4cf0befd7d3ba8e6255338d85188df82606185d /llvm/lib/Target
parent	bcfd94c2982e6b8468596390234832653a56fb54 (diff)
download	bcm5719-llvm-ac0fc849cf53b41735526eba7931ee4f3508fb8f.tar.gz bcm5719-llvm-ac0fc849cf53b41735526eba7931ee4f3508fb8f.zip