AMDGPU/SI: Move the local memory usage related checking after calling convention checking in PromoteAlloca

Summary: Promoting Alloca to Vector and Promoting Alloca to LDS are two independent handling of Alloca and should not affect each other. As a result, we should not give up promoting to vector if there is not enough LDS. This patch factors out the local memory usage related checking out and replace it after the calling convention checking. Reviewer: arsenm Differential Revision: http://reviews.llvm.org/D33139 llvm-svn: 303684
author: Changpeng Fang <changpeng.fang@gmail.com> 2017-05-23 20:25:41 +0000
committer: Changpeng Fang <changpeng.fang@gmail.com> 2017-05-23 20:25:41 +0000
commit: 1dbace195d29b18dee93bb8f5fd906035e9daf83 (patch)
tree: d63f6bc48aa696ee03d4456f2efd6d42aca7fa0b /llvm/lib/Target
parent: 3334cc017eecbd1eb95272cd5c523bdc01d1ed52 (diff)
download: bcm5719-llvm-1dbace195d29b18dee93bb8f5fd906035e9daf83.tar.gz
bcm5719-llvm-1dbace195d29b18dee93bb8f5fd906035e9daf83.zip
1 files changed, 114 insertions, 99 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85184b36390..07f92918a43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
                                        Instruction *UseInst,
                                        int OpIdx0, int OpIdx1) const;
 
+  /// Check whether we have enough local memory for promotion.
+  bool hasSufficientLocalMem(const Function &F);
+
 public:
   static char ID;
 
@@ -107,7 +110,7 @@ public:
 
   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
-  void handleAlloca(AllocaInst &I);
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
-  FunctionType *FTy = F.getFunctionType();
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
-                      "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  const DataLayout &DL = Mod->getDataLayout();
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
-      continue;
-
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use)
-        continue;
-
-      if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
 
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
-        CurrentLocalMemUsage += AllocSize;
-        break;
-      }
-    }
-  }
-
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
-                                                          F);
-
-  // Restrict local memory usage so that we don't drastically reduce occupancy,
-  // unless it is already significantly reduced.
-
-  // TODO: Have some sort of hint or other heuristics to guess occupancy based
-  // on other factors..
-  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
-  if (OccupancyHint == 0)
-    OccupancyHint = 7;
-
-  // Clamp to max value.
-  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
-  // Check the hint but ignore it if it's obviously wrong from the existing LDS
-  // usage.
-  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program is possibly broken by using more local mem than available.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
-    return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
-
-  DEBUG(
-    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
-    << "  Rounding size to " << MaxSizeWithWaveCount
-    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-    << " available for promotion\n"
-  );
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
 
+  bool SufficientLDS = hasSufficientLocalMem(F);
+  bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 
     ++I;
     if (AI)
-      handleAlloca(*AI);
+      Changed |= handleAlloca(*AI, SufficientLDS);
   }
 
-  return true;
+  return Changed;
 }
 
 std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
   return true;
 }
 
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+  FunctionType *FTy = F.getFunctionType();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  return true;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
   if (!I.isStaticAlloca() || I.isArrayAllocation())
-    return;
+    return false;
 
   IRBuilder<> Builder(&I);
 
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
-    return;
-  }
+  if (tryPromoteAllocaToVector(&I, AS))
+    return true; // Promoted to vector.
 
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
     break;
   default:
     DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
-    return;
+    return false;
   }
 
+  // Not likely to have sufficient local memory for promotion.
+  if (!SufficientLDS)
+    return false;
+
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   if (NewSize > LocalMemLimit) {
     DEBUG(dbgs() << "  " << AllocSize
           << " bytes of local memory not available to promote\n");
-    return;
+    return false;
   }
 
   CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return;
+    return false;
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+  return true;
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
author	Changpeng Fang <changpeng.fang@gmail.com>	2017-05-23 20:25:41 +0000
committer	Changpeng Fang <changpeng.fang@gmail.com>	2017-05-23 20:25:41 +0000
commit	1dbace195d29b18dee93bb8f5fd906035e9daf83 (patch)
tree	d63f6bc48aa696ee03d4456f2efd6d42aca7fa0b /llvm/lib/Target
parent	3334cc017eecbd1eb95272cd5c523bdc01d1ed52 (diff)
download	bcm5719-llvm-1dbace195d29b18dee93bb8f5fd906035e9daf83.tar.gz bcm5719-llvm-1dbace195d29b18dee93bb8f5fd906035e9daf83.zip