diff options
Diffstat (limited to 'polly')
| -rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 63 | ||||
| -rw-r--r-- | polly/test/GPGPU/double-parallel-loop.ll | 2 | ||||
| -rw-r--r-- | polly/test/GPGPU/host-control-flow.ll | 2 | ||||
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.c | 135 | ||||
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.h | 12 |
5 files changed, 183 insertions, 31 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 3b5a5d3aabe..a5d55ea7ac5 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -281,7 +281,9 @@ private: /// /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- /// dump its IR to stderr. - void finalizeKernelFunction(); + /// + /// @returns The Assembly string of the kernel. + std::string finalizeKernelFunction(); /// Create code that allocates memory to store arrays on device. void allocateDeviceArrays(); @@ -324,6 +326,19 @@ private: /// @param HostPtr A host pointer specifying the location to copy to. void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, Value *Size); + + /// Create a call to get a kernel from an assembly string. + /// + /// @param Buffer The string describing the kernel. + /// @param Entry The name of the kernel function to call. + /// + /// @returns A pointer to a kernel object + Value *createCallGetKernel(Value *Buffer, Value *Entry); + + /// Create a call to free a GPU kernel. + /// + /// @param GPUKernel THe kernel to free. + void createCallFreeKernel(Value *GPUKernel); }; void GPUNodeBuilder::initializeAfterRTH() { @@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays() { createCallFreeDeviceMemory(Array.second); } +Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { + const char *Name = "polly_getKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type *> Args; + Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + return Builder.CreateCall(F, {Buffer, Entry}); +} + +void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { + const char *Name = "polly_freeKernel"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type *> Args; + Args.push_back(Builder.getInt8PtrTy()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall(F, {GPUKernel}); +} + void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { const char *Name = "polly_freeDeviceMemory"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); LocalArrays.clear(); - finalizeKernelFunction(); + std::string ASMString = finalizeKernelFunction(); + std::string Name = "kernel_" + std::to_string(Kernel->id); + Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); + Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); + Value *GPUKernel = createCallGetKernel(KernelString, NameString); + createCallFreeKernel(GPUKernel); } /// Compute the DataLayout string for the NVPTX backend. @@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernelASM() { return ASMStream.str(); } -void GPUNodeBuilder::finalizeKernelFunction() { +std::string GPUNodeBuilder::finalizeKernelFunction() { // Verify module. 
llvm::legacy::PassManager Passes; Passes.add(createVerifierPass()); @@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunction() { GPUModule.release(); KernelIDs.clear(); + + return Assembly; } namespace { diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index cb2e2d65c0b..83d6d5e522a 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -96,6 +96,8 @@ ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304) ; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304) +; IR-NEXT: call i8* @polly_getKernel +; IR-NEXT: call void @polly_freeKernel ; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304) ; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A) diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll index 911d5cbc896..29ef71bdb23 100644 --- a/polly/test/GPGPU/host-control-flow.ll +++ b/polly/test/GPGPU/host-control-flow.ll @@ -30,6 +30,8 @@ ; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] +; IR-NEXT: call i8* @polly_getKernel +; IR-NEXT: call void @polly_freeKernel ; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 ; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98 ; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index 2e5f18c4d49..d2b516a9efa 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -17,6 +17,7 @@ #include <dlfcn.h> #include <stdarg.h> 
#include <stdio.h> +#include <string.h> static int DebugMode; @@ -36,12 +37,9 @@ struct PollyGPUContextT { CUcontext Cuda; }; -struct PollyGPUModuleT { - CUmodule Cuda; -}; - struct PollyGPUFunctionT { CUfunction Cuda; + CUmodule CudaModule; }; struct PollyGPUDevicePtrT { @@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, void **); static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module, + const void *image); +static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; + typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, const char *); static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; @@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state, + CUjitInputType type, void *data, + size_t size, const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues); +static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; + +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions, + CUjit_option *options, + void **optionValues, + CUlinkState *stateOut); +static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; + +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut, + size_t *sizeOut); +static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; + +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state); +static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; + /* Type-defines of function pointer ot CUDA runtime APIs. 
*/ typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; @@ -198,6 +221,9 @@ static int initialDeviceAPIs() { CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); + CuModuleLoadDataFcnPtr = + (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( HandleCuda, "cuModuleGetFunction"); @@ -208,6 +234,18 @@ static int initialDeviceAPIs() { CuDeviceGetNameFcnPtr = (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); + CuLinkAddDataFcnPtr = + (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); + + CuLinkCreateFcnPtr = + (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); + + CuLinkCompleteFcnPtr = + (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); + + CuLinkDestroyFcnPtr = + (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); + /* Get function pointer to CUDA Runtime APIs. 
*/ CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( HandleCudaRT, "cudaThreadSynchronize"); @@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() { return Context; } -void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) { +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, + const char *KernelName) { dump_function(); - *Module = malloc(sizeof(PollyGPUModule)); - if (*Module == 0) { - fprintf(stdout, "Allocate memory for Polly GPU module failed.\n"); + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + + if (Function == 0) { + fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); exit(-1); } - if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) != - CUDA_SUCCESS) { - fprintf(stdout, "Loading ptx assembly text failed.\n"); + CUresult Res; + CUlinkState LState; + CUjit_option Options[6]; + void *OptionVals[6]; + float Walltime = 0; + unsigned long LogSize = 8192; + char ErrorLog[8192], InfoLog[8192]; + void *CuOut; + size_t OutSize; + + // Setup linker options + // Return walltime from JIT compilation + Options[0] = CU_JIT_WALL_TIME; + OptionVals[0] = (void *)&Walltime; + // Pass a buffer for info messages + Options[1] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[1] = (void *)InfoLog; + // Pass the size of the info buffer + Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[2] = (void *)LogSize; + // Pass a buffer for error message + Options[3] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[3] = (void *)ErrorLog; + // Pass the size of the error buffer + Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[4] = (void *)LogSize; + // Make the linker verbose + Options[5] = CU_JIT_LOG_VERBOSE; + OptionVals[5] = (void *)1; + + memset(ErrorLog, 0, sizeof(ErrorLog)); + + CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); + Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer, + strlen(PTXBuffer) + 1, 0, 0, 0, 0); + if (Res != CUDA_SUCCESS) { + 
fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); exit(-1); } -} -void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, - PollyGPUFunction **Kernel) { - dump_function(); + Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); + if (Res != CUDA_SUCCESS) { + fprintf(stdout, "Complete ptx linker step failed.\n"); + fprintf(stdout, "\n%s\n", ErrorLog); + exit(-1); + } - *Kernel = malloc(sizeof(PollyGPUFunction)); - if (*Kernel == 0) { - fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n"); + debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, + InfoLog); + + Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); + if (Res != CUDA_SUCCESS) { + fprintf(stdout, "Loading ptx assembly text failed.\n"); exit(-1); } - /* Locate the kernel entry point. */ - if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) != - CUDA_SUCCESS) { + Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, + KernelName); + if (Res != CUDA_SUCCESS) { fprintf(stdout, "Loading kernel function failed.\n"); exit(-1); } + + CuLinkDestroyFcnPtr(LState); + + return Function; +} + +void polly_freeKernel(PollyGPUFunction *Kernel) { + dump_function(); + + if (Kernel->CudaModule) + CuModuleUnloadFcnPtr(Kernel->CudaModule); + + if (Kernel) + free(Kernel); } void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h index 467b832d974..38a8382b150 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -44,7 +44,6 @@ * const char *Entry = "_Z8myKernelPi"; * * int main() { - * PollyGPUModule *Module; * PollyGPUFunction *Kernel; * PollyGPUContext *Context; * PollyGPUDevicePtr *DevArray; @@ -58,11 +57,11 @@ * MemSize = 256*64*sizeof(int); * Context = polly_initContext(); * DevArray = polly_allocateMemoryForDevice(MemSize); - * polly_getPTXModule(KernelString, &Module); - 
* polly_getPTXKernelEntry(Entry, Module, &Kernel); + * Kernel = polly_getKernel(KernelString, Entry); * polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData); * polly_launchKernel(Kernel, GridWidth, GridHeight); * polly_copyFromDeviceToHost(HostData, DevData, MemSize); + * polly_freeKernel(Kernel); * polly_freeDeviceMemory(DevArray); * polly_freeContext(Context); * } @@ -70,14 +69,13 @@ */ typedef struct PollyGPUContextT PollyGPUContext; -typedef struct PollyGPUModuleT PollyGPUModule; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; PollyGPUContext *polly_initContext(); -void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module); -void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, - PollyGPUFunction **Kernel); +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, + const char *KernelName); +void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, long MemSize); void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, |

