| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-09 19:22:22 +0000 | 
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-09 19:22:22 +0000 | 
| commit | 40cb6cab563372d0a0b1bc8c127503508dc114e6 (patch) | |
| tree | 33a3ddf313e3cb7eff250174be7c0bfd7d85db29 /llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | |
| parent | dc73f512ae77888fac7f80af9bdc44a7208fbce8 (diff) | |
AMDGPU: Force inlining if LDS global address is used
Uses of LDS global addresses from non-kernel functions won't work for the foreseeable future. They aren't allowed in OpenCL, but IPO optimizations can make them appear.
Also, directly set the attributes on functions regardless of linkage, rather than cloning functions as before.
llvm-svn: 336587
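
The core idea is easier to read outside the diff below. The following is a minimal standalone sketch of what the message describes, not the patch itself: it hard-codes LDS as address space 3, uses a simplified kernel check (`AMDGPU_KERNEL`/`SPIR_KERNEL` calling conventions instead of `AMDGPU::isEntryFunctionCC`), and the helper name `forceInlineLDSUsers` is made up for illustration.

```cpp
// Sketch only: mark every non-kernel function that (transitively) uses an
// LDS global as alwaysinline, setting the attribute in place instead of
// cloning the function.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Assumption: LDS is address space 3 on AMDGPU; the real pass queries the
// target's address-space mapping instead of hard-coding it.
static constexpr unsigned LocalAS = 3;

// Simplified kernel check; the in-tree pass uses AMDGPU::isEntryFunctionCC,
// which covers more entry-point calling conventions.
static bool isKernel(const Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  return CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
}

static bool forceInlineLDSUsers(Module &M) {
  SmallPtrSet<Function *, 8> ToInline;

  for (GlobalVariable &GV : M.globals()) {
    if (GV.getType()->getAddressSpace() != LocalAS)
      continue;

    // Walk users through constant expressions down to instructions; any
    // non-kernel function reached must be inlined, and so must its callers.
    SmallVector<User *, 16> Stack(GV.user_begin(), GV.user_end());
    SmallPtrSet<User *, 16> Visited;
    while (!Stack.empty()) {
      User *U = Stack.pop_back_val();
      if (!Visited.insert(U).second)
        continue;
      if (auto *I = dyn_cast<Instruction>(U)) {
        Function *F = I->getFunction();
        if (!isKernel(*F)) {
          ToInline.insert(F);
          Stack.push_back(F); // also visit F's callers via F's uses
        }
        continue;
      }
      for (User *UU : U->users())
        Stack.push_back(UU);
    }
  }

  // Attributes are added directly, regardless of linkage; no cloning.
  for (Function *F : ToInline)
    F->addFnAttr(Attribute::AlwaysInline);
  return !ToInline.empty();
}
```

The actual pass below additionally handles the region address space, function aliases, and the StressCalls stress-testing mode.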
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 108 |

1 file changed, 87 insertions, 21 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index c27425443ab..d4bbb2c1eb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -14,6 +14,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls(
 class AMDGPUAlwaysInline : public ModulePass {
   bool GlobalOpt;
 
+  void recursivelyVisitUsers(GlobalValue &GV,
+                             SmallPtrSetImpl<Function *> &FuncsToAlwaysInline);
 public:
   static char ID;
 
   AMDGPUAlwaysInline(bool GlobalOpt = false) :
     ModulePass(ID), GlobalOpt(GlobalOpt) { }
   bool runOnModule(Module &M) override;
-  StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+ }
 };
 
 } // End anonymous namespace
@@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
 
 char AMDGPUAlwaysInline::ID = 0;
 
+void AMDGPUAlwaysInline::recursivelyVisitUsers(
+  GlobalValue &GV,
+  SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+  SmallVector<User *, 16> Stack;
+
+  SmallPtrSet<const Value *, 8> Visited;
+
+  for (User *U : GV.users())
+    Stack.push_back(U);
+
+  while (!Stack.empty()) {
+    User *U = Stack.pop_back_val();
+    if (!Visited.insert(U).second)
+      continue;
+
+    if (Instruction *I = dyn_cast<Instruction>(U)) {
+      Function *F = I->getParent()->getParent();
+      if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+        FuncsToAlwaysInline.insert(F);
+        Stack.push_back(F);
+      }
+
+      // No need to look at further users, but we do need to inline any callers.
+      continue;
+    }
+
+    for (User *UU : U->users())
+      Stack.push_back(UU);
+  }
+}
+
 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+  AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
+
   std::vector<GlobalAlias*> AliasesToRemove;
-  std::vector<Function *> FuncsToClone;
+
+  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
+  SmallPtrSet<Function *, 8> FuncsToNoInline;
 
   for (GlobalAlias &A : M.aliases()) {
     if (Function* F = dyn_cast<Function>(A.getAliasee())) {
       A.replaceAllUsesWith(F);
       AliasesToRemove.push_back(&A);
     }
+
+    // FIXME: If the aliasee isn't a function, it's some kind of constant expr
+    // cast that won't be inlined through.
   }
 
   if (GlobalOpt) {
@@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
     }
   }
 
-  auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline;
-  auto IncompatAttr
-    = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
-
-  for (Function &F : M) {
-    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
-        !F.hasFnAttribute(IncompatAttr))
-      FuncsToClone.push_back(&F);
-  }
-
-  for (Function *F : FuncsToClone) {
-    ValueToValueMapTy VMap;
-    Function *NewFunc = CloneFunction(F, VMap);
-    NewFunc->setLinkage(GlobalValue::InternalLinkage);
-    F->replaceAllUsesWith(NewFunc);
+  // Always force inlining of any function that uses an LDS global address. This
+  // is something of a workaround because we don't have a way of supporting LDS
+  // objects defined in functions. LDS is always allocated by a kernel, and it
+  // is difficult to manage LDS usage if a function may be used by multiple
+  // kernels.
+  //
+  // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
+  // should only appear when IPO passes manages to move LDs defined in a kernel
+  // into a single user function.
+
+  for (GlobalVariable &GV : M.globals()) {
+    // TODO: Region address
+    unsigned AS = GV.getType()->getAddressSpace();
+    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+      continue;
+
+    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
   }
 
-  for (Function &F : M) {
-    if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
-      F.addFnAttr(NewAttr);
+  if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
+    auto IncompatAttr
+      = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
+
+    for (Function &F : M) {
+      if (!F.isDeclaration() && !F.use_empty() &&
+          !F.hasFnAttribute(IncompatAttr)) {
+        if (StressCalls) {
+          if (!FuncsToAlwaysInline.count(&F))
+            FuncsToNoInline.insert(&F);
+        } else
+          FuncsToAlwaysInline.insert(&F);
+      }
     }
   }
-  return false;
+
+  for (Function *F : FuncsToAlwaysInline)
+    F->addFnAttr(Attribute::AlwaysInline);
+
+  for (Function *F : FuncsToNoInline)
+    F->addFnAttr(Attribute::NoInline);
+
+  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
 }
 
 ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
   return new AMDGPUAlwaysInline(GlobalOpt);
 }
+
```
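
For reference, the pass is still created through the existing factory function, so a test driver could schedule it with the legacy pass manager roughly as follows. This is an illustrative snippet, not part of the commit; in-tree, the AMDGPU backend adds the pass from its own pipeline, and the declaration normally comes from AMDGPU.h.

```cpp
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"

namespace llvm {
// Normally declared in AMDGPU.h; repeated here so the snippet is self-contained.
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt);
} // namespace llvm

// Run the always-inline pass over a module, mirroring the GlobalOpt flag the
// constructor takes in the patch above.
void runAMDGPUAlwaysInline(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createAMDGPUAlwaysInlinePass(/*GlobalOpt=*/true));
  PM.run(M);
}
```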

