summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorYaxun Liu <Yaxun.Liu@amd.com>2017-10-10 19:39:48 +0000
committerYaxun Liu <Yaxun.Liu@amd.com>2017-10-10 19:39:48 +0000
commitde4b88d9a1b961048864a17123b7a36974dc2b0d (patch)
tree83c80ffa7392191f48942322a16c8044f4401291 /llvm/lib
parent0f9e8898816020044082065b687bb730789ca918 (diff)
downloadbcm5719-llvm-de4b88d9a1b961048864a17123b7a36974dc2b0d.tar.gz
bcm5719-llvm-de4b88d9a1b961048864a17123b7a36974dc2b0d.zip
[AMDGPU] Lower enqueued blocks and generate runtime metadata
This patch adds a post-linking pass which replaces the function pointer of an enqueued block kernel with a global variable (runtime handle) and adds a runtime-handle attribute to the enqueued block kernel. In LLVM CodeGen the runtime-handle metadata will be translated to RuntimeHandle metadata in the code object. The runtime allocates a global buffer for each kernel with RuntimeHandle metadata and saves the kernel address required for the AQL packet into the buffer. The __enqueue_kernel function in the device library knows that the invoke function pointer in the block literal is actually a runtime handle, loads the kernel address from it, and puts it into the AQL packet for dispatching. This cannot be done in the FE since the FE cannot create a unique global variable with external linkage across LLVM modules. A global variable with internal linkage does not work since optimization passes will try to replace loads of the global variable with its initialization value. Differential Revision: https://reviews.llvm.org/D38610 llvm-svn: 315352
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp98
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp4
6 files changed, 113 insertions, 0 deletions
diff --git a/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp b/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp
index 863093ab7de..1872a003058 100644
--- a/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp
+++ b/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp
@@ -96,6 +96,8 @@ struct MappingTraits<Kernel::Attrs::Metadata> {
MD.mWorkGroupSizeHint, std::vector<uint32_t>());
YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
MD.mVecTypeHint, std::string());
+ YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle,
+ std::string());
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6bf8cdcb849..e8f7476dd76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -186,6 +186,10 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
Pass *createAMDGPUFunctionInliningPass();
void initializeAMDGPUInlinerPass(PassRegistry&);
+ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
+void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
+extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+
Target &getTheAMDGPUTarget();
Target &getTheGCNTarget();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
new file mode 100644
index 00000000000..68a204fca23
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -0,0 +1,98 @@
+//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This post-linking pass replaces the function pointer of enqueued
+// block kernel with a global variable (runtime handle) and adds
+// "runtime-handle" attribute to the enqueued block kernel.
+//
+// In LLVM CodeGen the runtime-handle metadata will be translated to
+// RuntimeHandle metadata in the code object. The runtime allocates a global
+// buffer for each kernel with RuntimeHandle metadata and saves the kernel
+// address required for the AQL packet into the buffer. The __enqueue_kernel
+// function in the device library knows that the invoke function pointer in
+// the block literal is actually a runtime handle, loads the kernel address
+// from it, and puts it into the AQL packet for dispatching.
+//
+// This cannot be done in FE since FE cannot create a unique global variable
+// with external linkage across LLVM modules. The global variable with internal
+// linkage does not work since optimization passes will try to replace loads
+// of the global variable with its initialization value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Module pass that lowers OpenCL enqueued blocks.
+///
+/// The class only carries the pass identity (\c ID) and a default constructor
+/// that forwards that identity to \c ModulePass; the lowering itself lives in
+/// the out-of-line \c runOnModule definition.
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
+  // Pass entry point (private, as in the base-class override); returns
+  // whether the module was modified.
+  bool runOnModule(Module &M) override;
+
+public:
+  /// Static pass identifier, defined once at file scope.
+  static char ID;
+
+  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
+};
+
+} // end anonymous namespace
+
+char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; // Definition of the static pass ID that the constructor hands to ModulePass.
+
+char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = // Reference to the pass ID, published in namespace llvm.
+ AMDGPUOpenCLEnqueuedBlockLowering::ID;
+
+INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
+ "Lower OpenCL enqueued blocks", false, false) // Registers the pass; DEBUG_TYPE ("amdgpu-lower-enqueued-block") is its argument name.
+
+/// Factory for the enqueued-block lowering pass.
+///
+/// \returns a newly heap-allocated AMDGPUOpenCLEnqueuedBlockLowering instance,
+/// typed as its ModulePass base.
+ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
+  auto *LoweringPass = new AMDGPUOpenCLEnqueuedBlockLowering();
+  return LoweringPass;
+}
+
+/// Replace the address of every "enqueued-block" function with a
+/// runtime-handle global variable.
+///
+/// A function is rewritten only when it has exactly one use, that user is a
+/// ConstantExpr with exactly one use, and that second user is a ConstantExpr
+/// as well. Functions that do not match this shape are left untouched.
+///
+/// For each matching function the pass:
+///  - creates the global "<function name>_runtime_handle" in the global
+///    address space (constant, external linkage, no initializer, externally
+///    initialized), typed as an i8 pointer into that same address space;
+///  - redirects all uses of the outer constant expression to the new global,
+///    pointer-cast to the outer expression's type;
+///  - records the handle name in the "runtime-handle" function attribute and
+///    gives the function external linkage.
+///
+/// \param M module to transform.
+/// \returns true if at least one function was rewritten.
+bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+  auto &Ctx = M.getContext();
+  auto AddrSpaces = AMDGPU::getAMDGPUAS(M);
+  bool Modified = false;
+
+  for (auto &Fn : M.functions()) {
+    if (!Fn.hasFnAttribute("enqueued-block"))
+      continue;
+
+    // Expected shape: Fn -> single ConstantExpr -> single ConstantExpr.
+    if (!Fn.hasOneUse())
+      continue;
+    auto *InnerCast = dyn_cast<ConstantExpr>(*Fn.user_begin());
+    if (!InnerCast || !InnerCast->hasOneUse())
+      continue;
+    auto *OuterCast = dyn_cast<ConstantExpr>(*InnerCast->user_begin());
+    if (!OuterCast)
+      continue;
+
+    const auto HandleName = (Fn.getName() + "_runtime_handle").str();
+    auto *HandleTy =
+        Type::getInt8Ty(Ctx)->getPointerTo(AddrSpaces.GLOBAL_ADDRESS);
+    auto *Handle = new GlobalVariable(
+        M, HandleTy,
+        /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+        /*Initializer=*/nullptr, HandleName, /*InsertBefore=*/nullptr,
+        GlobalValue::NotThreadLocal, AddrSpaces.GLOBAL_ADDRESS,
+        /*IsExternallyInitialized=*/true);
+    DEBUG(dbgs() << "runtime handle created: " << *Handle << '\n');
+
+    // Everything that referenced the casted function pointer now refers to
+    // the handle instead, keeping the outer expression's type.
+    OuterCast->replaceAllUsesWith(
+        ConstantExpr::getPointerCast(Handle, OuterCast->getType()));
+
+    Fn.addFnAttr("runtime-handle", HandleName);
+    Fn.setLinkage(GlobalValue::ExternalLinkage);
+    Modified = true;
+  }
+
+  return Modified;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a6b5aeaebc..2fdb012243a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
+ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -610,6 +611,9 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ // Replace OpenCL enqueued block function pointers with global variables.
+ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 450835f414a..baefbd3ae05 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
+ AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
index 4e828a791e0..4a576ca5c0b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -244,6 +244,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
}
+ if (Func.hasFnAttribute("runtime-handle")) {
+ Attrs.mRuntimeHandle =
+ Func.getFnAttribute("runtime-handle").getValueAsString().str();
+ }
}
void MetadataStreamer::emitKernelArgs(const Function &Func) {
OpenPOWER on IntegriCloud