5 files changed, 183 insertions, 31 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 3b5a5d3aabe..a5d55ea7ac5 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -281,7 +281,9 @@ private:
   ///
   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
   /// dump its IR to stderr.
-  void finalizeKernelFunction();
+  ///
+  /// @returns The Assembly string of the kernel.
+  std::string finalizeKernelFunction();
 
   /// Create code that allocates memory to store arrays on device.
   void allocateDeviceArrays();
@@ -324,6 +326,19 @@ private:
   /// @param HostPtr A host pointer specifying the location to copy to.
   void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                       Value *Size);
+
+  /// Create a call to get a kernel from an assembly string.
+  ///
+  /// @param Buffer The string describing the kernel.
+  /// @param Entry  The name of the kernel function to call.
+  ///
+  /// @returns A pointer to a kernel object
+  Value *createCallGetKernel(Value *Buffer, Value *Entry);
+
+  /// Create a call to free a GPU kernel.
+  ///
+  /// @param GPUKernel The kernel to free.
+  void createCallFreeKernel(Value *GPUKernel);
 };
 
 void GPUNodeBuilder::initializeAfterRTH() {
@@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() {
     createCallFreeDeviceMemory(Array.second);
 }
 
+Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
+  // Emit a call to the runtime function that builds a kernel from a string.
+  const char *RuntimeFnName = "polly_getKernel";
+  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *GetKernelFn = HostModule->getFunction(RuntimeFnName);
+
+  // Declare the runtime function on first use: i8* (i8*, i8*).
+  if (GetKernelFn == nullptr) {
+    Type *BytePtrTy = Builder.getInt8PtrTy();
+    std::vector<Type *> ParamTys = {BytePtrTy, BytePtrTy};
+    FunctionType *FnTy = FunctionType::get(BytePtrTy, ParamTys, false);
+    GetKernelFn = Function::Create(FnTy, Function::ExternalLinkage,
+                                   RuntimeFnName, HostModule);
+  }
+
+  return Builder.CreateCall(GetKernelFn, {Buffer, Entry});
+}
+
+void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
+  const char *RuntimeFnName = "polly_freeKernel";
+  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *FreeKernelFn = HostModule->getFunction(RuntimeFnName);
+
+  // Declare the runtime function on first use: void (i8*).
+  if (FreeKernelFn == nullptr) {
+    std::vector<Type *> ParamTys = {Builder.getInt8PtrTy()};
+    FunctionType *FnTy =
+        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    FreeKernelFn = Function::Create(FnTy, Function::ExternalLinkage,
+                                    RuntimeFnName, HostModule);
+  }
+
+  Builder.CreateCall(FreeKernelFn, {GPUKernel});
+}
+
 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
   const char *Name = "polly_freeDeviceMemory";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
     S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
   LocalArrays.clear();
 
-  finalizeKernelFunction();
+  std::string ASMString = finalizeKernelFunction();
+  std::string Name = "kernel_" + std::to_string(Kernel->id);
+  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
+  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
+  Value *GPUKernel = createCallGetKernel(KernelString, NameString);
+  createCallFreeKernel(GPUKernel);
 }
 
 /// Compute the DataLayout string for the NVPTX backend.
@@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() {
   return ASMStream.str();
 }
 
-void GPUNodeBuilder::finalizeKernelFunction() {
+std::string GPUNodeBuilder::finalizeKernelFunction() {
   // Verify module.
   llvm::legacy::PassManager Passes;
   Passes.add(createVerifierPass());
@@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() {
 
   GPUModule.release();
   KernelIDs.clear();
+
+  return Assembly;
 }
 
 namespace {
diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll
index cb2e2d65c0b..83d6d5e522a 100644
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@@ -96,6 +96,8 @@
 ; IR-NEXT:    %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
 ; IR-NEXT:    [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
+; IR-NEXT:    call i8* @polly_getKernel
+; IR-NEXT:    call void @polly_freeKernel
 ; IR-NEXT:    [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
 ; IR-NEXT:    call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
index 911d5cbc896..29ef71bdb23 100644
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@@ -30,6 +30,8 @@
 
 ; IR-LABEL: polly.loop_header:                                ; preds = %polly.loop_header, %polly.loop_preheader
 ; IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; IR-NEXT: call i8* @polly_getKernel
+; IR-NEXT: call void @polly_freeKernel
 ; IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
 ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 98
 ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index 2e5f18c4d49..d2b516a9efa 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -17,6 +17,7 @@
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <string.h>
 
 static int DebugMode;
 
@@ -36,12 +37,9 @@ struct PollyGPUContextT {
   CUcontext Cuda;
 };
 
-struct PollyGPUModuleT {
-  CUmodule Cuda;
-};
-
 struct PollyGPUFunctionT {
   CUfunction Cuda;
+  CUmodule CudaModule;
 };
 
 struct PollyGPUDevicePtrT {
@@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
                                                  void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
 
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+                                               const void *image);
+static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
+
 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
                                                   const char *);
 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
@@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
 
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+                                            CUjitInputType type, void *data,
+                                            size_t size, const char *name,
+                                            unsigned int numOptions,
+                                            CUjit_option *options,
+                                            void **optionValues);
+static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+                                           CUjit_option *options,
+                                           void **optionValues,
+                                           CUlinkState *stateOut);
+static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+                                             size_t *sizeOut);
+static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
+static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
+
 /* Type-defines of function pointer ot CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
   CuModuleLoadDataExFcnPtr =
       (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
 
+  CuModuleLoadDataFcnPtr =
+      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
+
   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
       HandleCuda, "cuModuleGetFunction");
 
@@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
   CuDeviceGetNameFcnPtr =
       (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
 
+  CuLinkAddDataFcnPtr =
+      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
+
+  CuLinkCreateFcnPtr =
+      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
+
+  CuLinkCompleteFcnPtr =
+      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
+
+  CuLinkDestroyFcnPtr =
+      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+
   /* Get function pointer to CUDA Runtime APIs. */
   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
       HandleCudaRT, "cudaThreadSynchronize");
@@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
   return Context;
 }
 
-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
+/* JIT-link PTXBuffer and return a handle to KernelName; exit(-1) on error. */
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName) {
   dump_function();
 
-  *Module = malloc(sizeof(PollyGPUModule));
-  if (*Module == 0) {
-    fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
+  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+  if (Function == 0) {
+    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
 
-  if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) !=
-      CUDA_SUCCESS) {
-    fprintf(stdout, "Loading ptx assembly text failed.\n");
+  CUresult Res;
+  CUlinkState LState;
+  CUjit_option Options[6];
+  void *OptionVals[6];
+  float Walltime = 0;
+  unsigned long LogSize = 8192;
+  char ErrorLog[8192] = "", InfoLog[8192] = "";
+  void *CuOut;
+  size_t OutSize;
+
+  // Setup linker options
+  // Return walltime from JIT compilation
+  Options[0] = CU_JIT_WALL_TIME;
+  OptionVals[0] = (void *)&Walltime;
+  // Pass a buffer for info messages
+  Options[1] = CU_JIT_INFO_LOG_BUFFER;
+  OptionVals[1] = (void *)InfoLog;
+  // Pass the size of the info buffer
+  Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+  OptionVals[2] = (void *)LogSize;
+  // Pass a buffer for error message
+  Options[3] = CU_JIT_ERROR_LOG_BUFFER;
+  OptionVals[3] = (void *)ErrorLog;
+  // Pass the size of the error buffer
+  Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  OptionVals[4] = (void *)LogSize;
+  // Make the linker verbose
+  Options[5] = CU_JIT_LOG_VERBOSE;
+  OptionVals[5] = (void *)1;
+
+  // Logs are zero-initialized above so failure paths never print garbage;
+  // a failed cuLinkCreate is reported the same way as a failed cuLinkAddData.
+  Res = CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
+  if (Res == CUDA_SUCCESS)
+    Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+                              strlen(PTXBuffer) + 1, 0, 0, 0, 0);
+  if (Res != CUDA_SUCCESS) {
+    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
   }
-}
 
-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
-                             PollyGPUFunction **Kernel) {
-  dump_function();
+  Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
+  if (Res != CUDA_SUCCESS) {
+    fprintf(stdout, "Complete ptx linker step failed.\n\n%s\n", ErrorLog);
+    exit(-1);
+  }
 
-  *Kernel = malloc(sizeof(PollyGPUFunction));
-  if (*Kernel == 0) {
-    fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
+  debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
+              InfoLog);
+
+  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
+  if (Res != CUDA_SUCCESS) {
+    fprintf(stdout, "Loading ptx assembly text failed.\n");
     exit(-1);
   }
 
-  /* Locate the kernel entry point. */
-  if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
-      CUDA_SUCCESS) {
+  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
+                                  KernelName);
+  if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Loading kernel function failed.\n");
     exit(-1);
   }
+
+  CuLinkDestroyFcnPtr(LState);
+
+  return Function;
+}
+
+/* Unload the kernel's CUDA module and free the handle; NULL is a no-op. */
+void polly_freeKernel(PollyGPUFunction *Kernel) {
+  dump_function();
+  if (!Kernel)
+    return;
+  if (Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+  free(Kernel);
+}
 
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h
index 467b832d974..38a8382b150 100644
--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@@ -44,7 +44,6 @@
  * const char *Entry = "_Z8myKernelPi";
  *
  * int main() {
- *   PollyGPUModule *Module;
  *   PollyGPUFunction *Kernel;
  *   PollyGPUContext *Context;
  *   PollyGPUDevicePtr *DevArray;
@@ -58,11 +57,11 @@
  *   MemSize = 256*64*sizeof(int);
  *   Context = polly_initContext();
  *   DevArray = polly_allocateMemoryForDevice(MemSize);
- *   polly_getPTXModule(KernelString, &Module);
- *   polly_getPTXKernelEntry(Entry, Module, &Kernel);
+ *   Kernel = polly_getKernel(KernelString, Entry);
  *   polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
  *   polly_launchKernel(Kernel, GridWidth, GridHeight);
  *   polly_copyFromDeviceToHost(HostData, DevData, MemSize);
+ *   polly_freeKernel(Kernel);
  *   polly_freeDeviceMemory(DevArray);
  *   polly_freeContext(Context);
  * }
@@ -70,14 +69,13 @@
  */
 
 typedef struct PollyGPUContextT PollyGPUContext;
-typedef struct PollyGPUModuleT PollyGPUModule;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
 
 PollyGPUContext *polly_initContext();
-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
-                             PollyGPUFunction **Kernel);
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+                                  const char *KernelName);
+void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                 long MemSize);
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,