diff options
| author | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-06-11 09:25:01 +0000 |
|---|---|---|
| committer | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-06-11 09:25:01 +0000 |
| commit | fb4842ff95dcead7ec3b19513421b0595ba792c9 (patch) | |
| tree | 7f6634756a8b824887e23d0e90e551dd22c65359 | |
| parent | 4b79647a6efbe80e6ffe78d65378cdd4d94c7773 (diff) | |
| download | bcm5719-llvm-fb4842ff95dcead7ec3b19513421b0595ba792c9.tar.gz bcm5719-llvm-fb4842ff95dcead7ec3b19513421b0595ba792c9.zip | |
Add the runtime library for GPGPU code generation.
Contributed by: Yabin Hu <yabin.hwu@gmail.com>
llvm-svn: 158304
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | polly/tools/CMakeLists.txt | 4 |
| -rw-r--r-- | polly/tools/GPURuntime/CMakeLists.txt | 12 |
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.c | 355 |
| -rw-r--r-- | polly/tools/GPURuntime/GPUJIT.h | 41 |
| -rw-r--r-- | polly/tools/GPURuntime/Makefile | 16 |
| -rw-r--r-- | polly/tools/Makefile | 7 |
6 files changed, 435 insertions, 0 deletions
diff --git a/polly/tools/CMakeLists.txt b/polly/tools/CMakeLists.txt index 7046b55ec88..4ce60e1a3e8 100644 --- a/polly/tools/CMakeLists.txt +++ b/polly/tools/CMakeLists.txt @@ -1 +1,5 @@ +if (CUDALIB_FOUND) + add_subdirectory(GPURuntime) +endif (CUDALIB_FOUND) + set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) diff --git a/polly/tools/GPURuntime/CMakeLists.txt b/polly/tools/GPURuntime/CMakeLists.txt new file mode 100644 index 00000000000..822e4de1933 --- /dev/null +++ b/polly/tools/GPURuntime/CMakeLists.txt @@ -0,0 +1,12 @@ +set(MODULE TRUE) +set(LLVM_NO_RTTI 1) + +add_polly_library(GPURuntime + GPUJIT.c + ) + +set_target_properties(GPURuntime + PROPERTIES + LINKER_LANGUAGE C + PREFIX "lib" + ) diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c new file mode 100644 index 00000000000..f5936f2d155 --- /dev/null +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -0,0 +1,355 @@ +/******************** GPUJIT.c - GPUJIT Execution Engine *********************/ +/* */ +/* The LLVM Compiler Infrastructure */ +/* */ +/* This file is distributed under the University of Illinois Open Source */ +/* License. See LICENSE.TXT for details. */ +/* */ +/******************************************************************************/ +/* */ +/* This file implements GPUJIT, a ptx string execution engine for GPU. */ +/* */ +/******************************************************************************/ + +#include "GPUJIT.h" +#include <dlfcn.h> +#include <stdio.h> /* NOTE(review): exit(), malloc() and free() are used below but stdlib.h is not included here; presumably it arrives through GPUJIT.h and the CUDA headers - verify. */ + +/* Dynamic library handles for the CUDA and CUDA runtime library. */ +static void *HandleCuda; +static void *HandleCudaRT; + +/* Type-defines of function pointer to CUDA driver APIs. 
*/ +typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); +static CuMemAllocFcnTy *CuMemAllocFcnPtr; + +typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int); +static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr; + +typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *, + unsigned int); +static CuParamSetvFcnTy *CuParamSetvFcnPtr; + +typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int); +static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr; + +typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int); +static CuLaunchGridFcnTy *CuLaunchGridFcnPtr; + +typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); +static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; + +typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); +static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; + +typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); +static CuMemFreeFcnTy *CuMemFreeFcnPtr; + +typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); +static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; + +typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); +static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; + +typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); +static CuInitFcnTy *CuInitFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); +static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; + +typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); +static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); +static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; + +typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, + unsigned int, CUjit_option *, + void **); +static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; + +typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, + const char *); +static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; + +typedef CUresult CUDAAPI 
CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); +static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; + +typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); +static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; + +/* Type-defines of function pointer to CUDA runtime APIs. */ +typedef cudaError_t CUDARTAPI CudaEventCreateFcnTy(cudaEvent_t *); +static CudaEventCreateFcnTy *CudaEventCreateFcnPtr; + +typedef cudaError_t CUDARTAPI CudaEventRecordFcnTy(cudaEvent_t, + cudaStream_t); +static CudaEventRecordFcnTy *CudaEventRecordFcnPtr; + +typedef cudaError_t CUDARTAPI CudaEventSynchronizeFcnTy(cudaEvent_t); +static CudaEventSynchronizeFcnTy *CudaEventSynchronizeFcnPtr; + +typedef cudaError_t CUDARTAPI CudaEventElapsedTimeFcnTy(float *, cudaEvent_t, + cudaEvent_t); +static CudaEventElapsedTimeFcnTy *CudaEventElapsedTimeFcnPtr; + +typedef cudaError_t CUDARTAPI CudaEventDestroyFcnTy(cudaEvent_t); +static CudaEventDestroyFcnTy *CudaEventDestroyFcnPtr; + +typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); +static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; + +static void *getAPIHandle(void *Handle, const char *FuncName) { /* Resolve FuncName in Handle via dlsym(); if dlerror() reports a problem, print it to stdout and return 0. */ + char *Err; + void *FuncPtr; + dlerror(); /* discard any earlier error so the check below reflects this dlsym() call */ + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibraries() { /* dlopen() libcuda.so and libcudart.so into HandleCuda and HandleCudaRT; returns 1 if both open, 0 otherwise. */ + HandleCuda = dlopen("libcuda.so", RTLD_LAZY); + if (!HandleCuda) { + printf("Cannot open library: %s. \n", dlerror()); + return 0; + } + + HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); + if (!HandleCudaRT) { + printf("Cannot open library: %s. 
\n", dlerror()); + return 0; /* NOTE(review): HandleCuda is left open on this path */ + } + + return 1; +} + +static int initialDeviceAPIs() { /* Open both libraries, then fill every FcnPtr global through getAPIHandle(); returns 0 only if a library could not be opened. */ + if (initialDeviceAPILibraries() == 0) + return 0; + + /* Get function pointer to CUDA Driver APIs. 
+ * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. + */ + CuFuncSetBlockShapeFcnPtr = + (CuFuncSetBlockShapeFcnTy *) getAPIHandle(HandleCuda, + "cuFuncSetBlockShape"); + + CuParamSetvFcnPtr = (CuParamSetvFcnTy *) getAPIHandle(HandleCuda, + "cuParamSetv"); + + CuParamSetSizeFcnPtr = (CuParamSetSizeFcnTy *) getAPIHandle(HandleCuda, + "cuParamSetSize"); + + CuLaunchGridFcnPtr = (CuLaunchGridFcnTy *) getAPIHandle(HandleCuda, + "cuLaunchGrid"); + + CuMemAllocFcnPtr = (CuMemAllocFcnTy *) getAPIHandle(HandleCuda, + "cuMemAlloc_v2"); + + CuMemFreeFcnPtr = (CuMemFreeFcnTy *) getAPIHandle(HandleCuda, "cuMemFree_v2"); + + CuMemcpyDtoHFcnPtr = (CuMemcpyDtoHFcnTy *) getAPIHandle(HandleCuda, + "cuMemcpyDtoH_v2"); + + CuMemcpyHtoDFcnPtr = (CuMemcpyHtoDFcnTy *) getAPIHandle(HandleCuda, + "cuMemcpyHtoD_v2"); + + CuModuleUnloadFcnPtr = (CuModuleUnloadFcnTy *) getAPIHandle(HandleCuda, + "cuModuleUnload"); + + CuCtxDestroyFcnPtr = (CuCtxDestroyFcnTy *) getAPIHandle(HandleCuda, + "cuCtxDestroy"); + + CuInitFcnPtr = (CuInitFcnTy *) getAPIHandle(HandleCuda, "cuInit"); + + CuDeviceGetCountFcnPtr = (CuDeviceGetCountFcnTy *) getAPIHandle(HandleCuda, + "cuDeviceGetCount"); + + CuDeviceGetFcnPtr = (CuDeviceGetFcnTy *) getAPIHandle(HandleCuda, + "cuDeviceGet"); + + CuCtxCreateFcnPtr = (CuCtxCreateFcnTy *) getAPIHandle(HandleCuda, + "cuCtxCreate_v2"); + + CuModuleLoadDataExFcnPtr = + (CuModuleLoadDataExFcnTy *) getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); + + CuModuleGetFunctionFcnPtr = + (CuModuleGetFunctionFcnTy *)getAPIHandle(HandleCuda, "cuModuleGetFunction"); + + CuDeviceComputeCapabilityFcnPtr = + (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(HandleCuda, + "cuDeviceComputeCapability"); + + 
CuDeviceGetNameFcnPtr = + (CuDeviceGetNameFcnTy *) getAPIHandle(HandleCuda, "cuDeviceGetName"); + + /* Get function pointer to CUDA Runtime APIs. */ + CudaEventCreateFcnPtr = + (CudaEventCreateFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventCreate"); + + CudaEventRecordFcnPtr = + (CudaEventRecordFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventRecord"); + + CudaEventSynchronizeFcnPtr = + (CudaEventSynchronizeFcnTy *) getAPIHandle(HandleCudaRT, + "cudaEventSynchronize"); + + CudaEventElapsedTimeFcnPtr = + (CudaEventElapsedTimeFcnTy *) getAPIHandle(HandleCudaRT, + "cudaEventElapsedTime"); + + CudaEventDestroyFcnPtr = + (CudaEventDestroyFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventDestroy"); + + CudaThreadSynchronizeFcnPtr = + (CudaThreadSynchronizeFcnTy *) getAPIHandle(HandleCudaRT, + "cudaThreadSynchronize"); + + return 1; /* reached even if some lookups above returned 0; they are not checked */ +} + +void polly_initDevice(CUcontext *Context, CUdevice *Device) { /* Load the API pointers, call cuInit, take device 0 and create a context on it; Context and Device are outputs. Exits with -1 if loading, cuInit or device discovery fails. */ + int Major = 0, Minor = 0, DeviceID = 0; + char DeviceName[256]; + int DeviceCount = 0; + + /* Get API handles. */ + if (initialDeviceAPIs() == 0) { + fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); + exit(-1); + } + + if (CuInitFcnPtr(0) != CUDA_SUCCESS) { + fprintf(stdout, "Initializing the CUDA driver API failed.\n"); + exit(-1); + } + + /* Get number of devices that support CUDA. */ + CuDeviceGetCountFcnPtr(&DeviceCount); + if (DeviceCount == 0) { + fprintf(stdout, "There is no device supporting CUDA.\n"); + exit(-1); + } + + /* We select the 1st device as default. */ + CuDeviceGetFcnPtr(Device, 0); + + /* Get compute capabilities and the device name. 
*/ + CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, *Device); /* Major and Minor are not read afterwards */ + CuDeviceGetNameFcnPtr(DeviceName, 256, *Device); + fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName); + + /* Create context on the device. 
*/ + CuCtxCreateFcnPtr(Context, 0, *Device); /* return value is not checked */ +} + +void polly_getPTXModule(void *PTXBuffer, CUmodule *Module) { /* Load the PTX image at PTXBuffer into Module, passing 0 for the option count and both option arrays; exits with -1 on failure. */ + if(CuModuleLoadDataExFcnPtr(Module, PTXBuffer, 0, 0, 0) != CUDA_SUCCESS) { + fprintf(stdout, "Loading ptx assembly text failed.\n"); + exit(-1); + } +} + +void polly_getPTXKernelEntry(const char *KernelName, CUmodule *Module, + CUfunction *Kernel) { /* Store the function named KernelName from Module in Kernel; exits with -1 on failure. */ + /* Locate the kernel entry point. */ + if(CuModuleGetFunctionFcnPtr(Kernel, *Module, KernelName) + != CUDA_SUCCESS) { + fprintf(stdout, "Loading kernel function failed.\n"); + exit(-1); + } +} + +void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer, + cudaEvent_t *StopTimer) { /* Create both events and record StartTimer with stream argument 0; results are not checked. */ + CudaEventCreateFcnPtr(StartTimer); + CudaEventCreateFcnPtr(StopTimer); + CudaEventRecordFcnPtr(*StartTimer, 0); +} + +void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, + cudaEvent_t *StopTimer, float *ElapsedTimes) { /* Record StopTimer and wait for it, write the elapsed time between the two events to ElapsedTimes, destroy both events and print the time to stderr. */ + CudaEventRecordFcnPtr(*StopTimer, 0); + CudaEventSynchronizeFcnPtr(*StopTimer); + CudaEventElapsedTimeFcnPtr(ElapsedTimes, *StartTimer, *StopTimer ); + CudaEventDestroyFcnPtr(*StartTimer); + CudaEventDestroyFcnPtr(*StopTimer); + fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes); +} + +void polly_allocateMemoryForHostAndDevice(void **PtrHostData, + CUdeviceptr *PtrDevData, + int MemSize) { /* malloc() MemSize bytes into PtrHostData (exit -1 on failure) and request MemSize bytes of device memory into PtrDevData; the device allocation result is not checked. 
*/ + if ((*PtrHostData = (int *)malloc(MemSize)) == 0) { + fprintf(stdout, "Could not allocate host memory.\n"); + exit(-1); + } + CuMemAllocFcnPtr(PtrDevData, MemSize); +} + +void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData, + int MemSize) { /* Copy MemSize bytes from HostData to DevData; the result is not checked. */ + CuMemcpyHtoDFcnPtr(DevData, HostData, MemSize); +} + +void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData, + int MemSize) { /* Copy MemSize bytes from DevData to HostData; exits with -1 on failure. */ + if(CuMemcpyDtoHFcnPtr(HostData, DevData, MemSize) != CUDA_SUCCESS) { + fprintf(stdout, "Copying results from device to host memory failed.\n"); + exit(-1); + } +} + +void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth, + int BlockHeight, CUdeviceptr DevData) { /* Set the block shape to BlockWidth x BlockHeight x 1 and pass DevData as the single kernel parameter at offset 0. */ + int ParamOffset = 0; + 
CuFuncSetBlockShapeFcnPtr(*Kernel, BlockWidth, BlockHeight, 1); + CuParamSetvFcnPtr(*Kernel, ParamOffset, &DevData, sizeof(DevData)); + ParamOffset += sizeof(DevData); + CuParamSetSizeFcnPtr(*Kernel, ParamOffset); +} + +void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) { /* Launch Kernel on a GridWidth x GridHeight grid (exit -1 on failure), then call cudaThreadSynchronize before printing the message. */ + if (CuLaunchGridFcnPtr(*Kernel, GridWidth, GridHeight) != CUDA_SUCCESS) { + fprintf(stdout, "Launching CUDA kernel failed.\n"); + exit(-1); + } + CudaThreadSynchronizeFcnPtr(); + fprintf(stdout, "CUDA kernel launched.\n"); +} + +void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData, + CUmodule *Module, CUcontext *Context) { /* Release the host and device buffers, unload Module, destroy Context and dlclose() both libraries. */ + if (HostData) { + free(HostData); + HostData = 0; /* HostData is passed by value, so this only clears the local copy */ + } + + if (DevData) { + CuMemFreeFcnPtr(DevData); + DevData = 0; /* likewise only clears the local copy */ + } + + if (*Module) { + CuModuleUnloadFcnPtr(*Module); + *Module = 0; + } + + if (*Context) { + CuCtxDestroyFcnPtr(*Context); + *Context = 0; + } + + dlclose(HandleCuda); + dlclose(HandleCudaRT); +} diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h new file mode 100644 index 00000000000..718544faefe --- /dev/null +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -0,0 +1,41 @@ +/******************************************************************************/ +/* */ +/* The LLVM Compiler Infrastructure */ +/* */ +/* This file is distributed under the University of Illinois Open Source */ +/* License. See LICENSE.TXT for details. */ +/* */ +/******************************************************************************/ +/* */ +/* This file defines GPUJIT. 
*/ +/* */ +/******************************************************************************/ + +#ifndef GPUJIT_H_ +#define GPUJIT_H_ + +#include <cuda.h> +#include <cuda_runtime.h> + +void polly_initDevice(CUcontext *Context, CUdevice *Device); +void polly_getPTXModule(void *PTXBuffer, CUmodule *Module); +void polly_getPTXKernelEntry(const char *KernelName, + CUmodule *Module, + CUfunction *Kernel); +void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer, + cudaEvent_t *StopTimer); +void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, cudaEvent_t *StopTimer, + float *ElapsedTimes); +void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData, + int MemSize); +void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData, + int MemSize); +void polly_allocateMemoryForHostAndDevice(void **PtrHostData, + CUdeviceptr *PtrDevData, + int MemSize); +void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth, + int BlockHeight, CUdeviceptr DevData); +void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight); +void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData, + CUmodule *Module, CUcontext *Context); +#endif /* GPUJIT_H_ */ diff --git a/polly/tools/GPURuntime/Makefile b/polly/tools/GPURuntime/Makefile new file mode 100644 index 00000000000..4822f70218a --- /dev/null +++ b/polly/tools/GPURuntime/Makefile @@ -0,0 +1,16 @@ +##===- polly/lib/GPURuntime/Makefile -----------------------*- Makefile -*-===## +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../.. 
+ +LIBRARYNAME = libGPURuntime +LOADABLE_MODULE = 1 + +include $(LEVEL)/Makefile.config +CPP.Flags += $(POLLY_INC) +include $(LEVEL)/Makefile.common diff --git a/polly/tools/Makefile b/polly/tools/Makefile index 2d97a602915..21b6a4290db 100644 --- a/polly/tools/Makefile +++ b/polly/tools/Makefile @@ -10,4 +10,11 @@ LEVEL := .. DIRS := +include $(LEVEL)/Makefile.config + +# GPU Runtime Support +ifeq ($(CUDALIB_FOUND), yes) + DIRS += GPURuntime +endif + include $(LEVEL)/Makefile.common |

