diff options
author | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-07-04 21:45:03 +0000 |
---|---|---|
committer | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-07-04 21:45:03 +0000 |
commit | 5c0f6f3350ed1df489b1c4f8d5328b414eafc46f (patch) | |
tree | 3d43387db009d8305508e3d4ae3f85046aa303b2 | |
parent | eb7b9f824843af2c185139ab5ad3c734ad37997b (diff) | |
download | bcm5719-llvm-5c0f6f3350ed1df489b1c4f8d5328b414eafc46f.tar.gz bcm5719-llvm-5c0f6f3350ed1df489b1c4f8d5328b414eafc46f.zip |
Replace CUDA data types with Polly's GPGPU data types.
Contributed by: Yabin Hu <yabin.hwu@gmail.com>
llvm-svn: 159725
-rw-r--r-- | polly/CMakeLists.txt | 3 | ||||
-rw-r--r-- | polly/cmake/FindCUDA.cmake | 6 | ||||
-rw-r--r-- | polly/tools/GPURuntime/GPUJIT.c | 181 | ||||
-rw-r--r-- | polly/tools/GPURuntime/GPUJIT.h | 103 |
4 files changed, 219 insertions, 74 deletions
diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt index f385d37cec6..368c2ed3e32 100644 --- a/polly/CMakeLists.txt +++ b/polly/CMakeLists.txt @@ -144,9 +144,6 @@ macro(add_polly_library name) if (SCOPLIB_FOUND) target_link_libraries( ${name} ${SCOPLIB_LIBRARY}) endif(SCOPLIB_FOUND) - if (CUDALIB_FOUND) - target_link_libraries( ${name} ${CUDALIB_LIBRARY}) - endif(CUDALIB_FOUND) if( LLVM_LINK_COMPONENTS ) llvm_config(${name} ${LLVM_LINK_COMPONENTS}) diff --git a/polly/cmake/FindCUDA.cmake b/polly/cmake/FindCUDA.cmake index 9683bc4fd54..37592f9efc5 100644 --- a/polly/cmake/FindCUDA.cmake +++ b/polly/cmake/FindCUDA.cmake @@ -8,13 +8,13 @@ FIND_PATH(CUDALIB_INCLUDE_DIR FIND_LIBRARY(CUDALIB_LIBRARY NAMES cuda) -IF (CUDALIB_INCLUDE_DIR AND CUDALIB_LIBRARY) +IF (CUDALIB_INCLUDE_DIR) SET(CUDALIB_FOUND TRUE) -ENDIF (CUDALIB_INCLUDE_DIR AND CUDALIB_LIBRARY) +ENDIF (CUDALIB_INCLUDE_DIR) IF (CUDALIB_FOUND) IF (NOT CUDA_FIND_QUIETLY) - MESSAGE(STATUS "Found CUDA: ${CUDALIB_LIBRARY}") + MESSAGE(STATUS "Found CUDA: ${CUDALIB_INCLUDE_DIR}") ENDIF (NOT CUDA_FIND_QUIETLY) ELSE (CUDALIB_FOUND) IF (CUDA_FIND_REQUIRED) diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index f5936f2d155..dbe976e0753 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -12,9 +12,36 @@ /******************************************************************************/ #include "GPUJIT.h" +#include <cuda.h> +#include <cuda_runtime.h> #include <dlfcn.h> #include <stdio.h> +/* Define Polly's GPGPU data types. */ +struct PollyGPUContextT { + CUcontext Cuda; +}; + +struct PollyGPUModuleT { + CUmodule Cuda; +}; + +struct PollyGPUFunctionT { + CUfunction Cuda; +}; + +struct PollyGPUDeviceT { + CUdevice Cuda; +}; + +struct PollyGPUDevicePtrT { + CUdeviceptr Cuda; +}; + +struct PollyGPUEventT { + cudaEvent_t Cuda; +}; + /* Dynamic library handles for the CUDA and CUDA runtime library. */ static void *HandleCuda; static void *HandleCudaRT; @@ -218,7 +245,7 @@ static int initialDeviceAPIs() { return 1; } -void polly_initDevice(CUcontext *Context, CUdevice *Device) { +void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) { int Major = 0, Minor = 0, DeviceID = 0; char DeviceName[256]; int DeviceCount = 0; @@ -242,85 +269,135 @@ void polly_initDevice(CUcontext *Context, CUdevice *Device) { } /* We select the 1st device as default. */ - CuDeviceGetFcnPtr(Device, 0); + *Device = malloc(sizeof(PollyGPUDevice)); + if (*Device == 0) { + fprintf(stdout, "Allocate memory for Polly GPU device failed.\n"); + exit(-1); + } + CuDeviceGetFcnPtr(&((*Device)->Cuda), 0); /* Get compute capabilities and the device name. */ - CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, *Device); - CuDeviceGetNameFcnPtr(DeviceName, 256, *Device); + CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda); + CuDeviceGetNameFcnPtr(DeviceName, 256, (*Device)->Cuda); fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName); /* Create context on the device. */ - CuCtxCreateFcnPtr(Context, 0, *Device); + *Context = malloc(sizeof(PollyGPUContext)); + if (*Context == 0) { + fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda); } -void polly_getPTXModule(void *PTXBuffer, CUmodule *Module) { - if(CuModuleLoadDataExFcnPtr(Module, PTXBuffer, 0, 0, 0) != CUDA_SUCCESS) { +void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) { + *Module = malloc(sizeof(PollyGPUModule)); + if (*Module == 0) { + fprintf(stdout, "Allocate memory for Polly GPU module failed.\n"); + exit(-1); + } + + if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) + != CUDA_SUCCESS) { fprintf(stdout, "Loading ptx assembly text failed.\n"); exit(-1); } } -void polly_getPTXKernelEntry(const char *KernelName, CUmodule *Module, - CUfunction *Kernel) { +void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, + PollyGPUFunction **Kernel) { + *Kernel = malloc(sizeof(PollyGPUFunction)); + if (*Kernel == 0) { + fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n"); + exit(-1); + } + /* Locate the kernel entry point. */ - if(CuModuleGetFunctionFcnPtr(Kernel, *Module, KernelName) + if(CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) != CUDA_SUCCESS) { fprintf(stdout, "Loading kernel function failed.\n"); exit(-1); } } -void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer, - cudaEvent_t *StopTimer) { - CudaEventCreateFcnPtr(StartTimer); - CudaEventCreateFcnPtr(StopTimer); - CudaEventRecordFcnPtr(*StartTimer, 0); +void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) { + *Start = malloc(sizeof(PollyGPUEvent)); + if (*Start == 0) { + fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n"); + exit(-1); + } + CudaEventCreateFcnPtr(&((*Start)->Cuda)); + + *Stop = malloc(sizeof(PollyGPUEvent)); + if (*Stop == 0) { + fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n"); + exit(-1); + } + CudaEventCreateFcnPtr(&((*Stop)->Cuda)); + + /* Record the start time. */ + CudaEventRecordFcnPtr((*Start)->Cuda, 0); } -void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, - cudaEvent_t *StopTimer, float *ElapsedTimes) { - CudaEventRecordFcnPtr(*StopTimer, 0); - CudaEventSynchronizeFcnPtr(*StopTimer); - CudaEventElapsedTimeFcnPtr(ElapsedTimes, *StartTimer, *StopTimer ); - CudaEventDestroyFcnPtr(*StartTimer); - CudaEventDestroyFcnPtr(*StopTimer); +void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop, + float *ElapsedTimes) { + /* Record the end time. */ + CudaEventRecordFcnPtr(Stop->Cuda, 0); + CudaEventSynchronizeFcnPtr(Start->Cuda); + CudaEventSynchronizeFcnPtr(Stop->Cuda); + CudaEventElapsedTimeFcnPtr(ElapsedTimes, Start->Cuda, Stop->Cuda); + CudaEventDestroyFcnPtr(Start->Cuda); + CudaEventDestroyFcnPtr(Stop->Cuda); fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes); + + free(Start); + free(Stop); } -void polly_allocateMemoryForHostAndDevice(void **PtrHostData, - CUdeviceptr *PtrDevData, +void polly_allocateMemoryForHostAndDevice(void **HostData, + PollyGPUDevicePtr **DevData, int MemSize) { - if ((*PtrHostData = (int *)malloc(MemSize)) == 0) { + if ((*HostData = (int *)malloc(MemSize)) == 0) { fprintf(stdout, "Could not allocate host memory.\n"); exit(-1); } - CuMemAllocFcnPtr(PtrDevData, MemSize); + + *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (*DevData == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + CuMemAllocFcnPtr(&((*DevData)->Cuda), MemSize); } -void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData, +void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData, int MemSize) { - CuMemcpyHtoDFcnPtr(DevData, HostData, MemSize); + CUdeviceptr CuDevData = DevData->Cuda; + CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); } -void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData, +void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData, int MemSize) { - if(CuMemcpyDtoHFcnPtr(HostData, DevData, MemSize) != CUDA_SUCCESS) { + if(CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { fprintf(stdout, "Copying results from device to host memory failed.\n"); exit(-1); } } -void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth, - int BlockHeight, CUdeviceptr DevData) { +void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth, + int BlockHeight, PollyGPUDevicePtr *DevData) { int ParamOffset = 0; - CuFuncSetBlockShapeFcnPtr(*Kernel, BlockWidth, BlockHeight, 1); - CuParamSetvFcnPtr(*Kernel, ParamOffset, &DevData, sizeof(DevData)); - ParamOffset += sizeof(DevData); - CuParamSetSizeFcnPtr(*Kernel, ParamOffset); + + CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1); + CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda), + sizeof(DevData->Cuda)); + ParamOffset += sizeof(DevData->Cuda); + CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset); } -void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) { - if (CuLaunchGridFcnPtr(*Kernel, GridWidth, GridHeight) != CUDA_SUCCESS) { +void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth, + int GridHeight) { + if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) { fprintf(stdout, "Launching CUDA kernel failed.\n"); exit(-1); } @@ -328,26 +405,32 @@ void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) { fprintf(stdout, "CUDA kernel launched.\n"); } -void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData, - CUmodule *Module, CUcontext *Context) { +void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData, + PollyGPUModule *Module, + PollyGPUContext *Context, + PollyGPUFunction *Kernel) { if (HostData) { free(HostData); HostData = 0; } - if (DevData) { - CuMemFreeFcnPtr(DevData); - DevData = 0; + if (DevData->Cuda) { + CuMemFreeFcnPtr(DevData->Cuda); + free(DevData); + } + + if (Module->Cuda) { + CuModuleUnloadFcnPtr(Module->Cuda); + free(Module); } - if (*Module) { - CuModuleUnloadFcnPtr(*Module); - *Module = 0; + if (Context->Cuda) { + CuCtxDestroyFcnPtr(Context->Cuda); + free(Context); } - if (*Context) { - CuCtxDestroyFcnPtr(*Context); - *Context = 0; + if (Kernel) { + free(Kernel); } dlclose(HandleCuda); diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h index 718544faefe..9026ab42f34 100644 --- a/polly/tools/GPURuntime/GPUJIT.h +++ b/polly/tools/GPURuntime/GPUJIT.h @@ -14,28 +14,93 @@ #ifndef GPUJIT_H_ #define GPUJIT_H_ -#include <cuda.h> -#include <cuda_runtime.h> +/* + * The following demostrates how we can use the GPURuntime library to + * execute a GPU kernel. + * + * char KernelString[] = "\n\ + * .version 1.4\n\ + * .target sm_10, map_f64_to_f32\n\ + * .entry _Z8myKernelPi (\n\ + * .param .u64 __cudaparm__Z8myKernelPi_data)\n\ + * {\n\ + * .reg .u16 %rh<4>;\n\ + * .reg .u32 %r<5>;\n\ + * .reg .u64 %rd<6>;\n\ + * cvt.u32.u16 %r1, %tid.x;\n\ + * mov.u16 %rh1, %ctaid.x;\n\ + * mov.u16 %rh2, %ntid.x;\n\ + * mul.wide.u16 %r2, %rh1, %rh2;\n\ + * add.u32 %r3, %r1, %r2;\n\ + * ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\ + * cvt.s64.s32 %rd2, %r3;\n\ + * mul.wide.s32 %rd3, %r3, 4;\n\ + * add.u64 %rd4, %rd1, %rd3;\n\ + * st.global.s32 [%rd4+0], %r3;\n\ + * exit;\n\ + * }\n\ + * "; + * + * const char *Entry = "_Z8myKernelPi"; + * + * int main() { + * PollyGPUContext *Context; + * PollyGPUModule *Module; + * PollyGPUFunction *Kernel; + * PollyGPUDevice *Device; + * PollyGPUDevicePtr *PtrDevData; + * int *HostData; + * PollyGPUEvent *Start; + * PollyGPUEvent *Stop; + * float *ElapsedTime; + * int MemSize; + * int BlockWidth = 16; + * int BlockHeight = 16; + * int GridWidth = 8; + * int GridHeight = 8; + * + * MemSize = 256*64*sizeof(int); + * polly_initDevice(&Context, &Device); + * polly_getPTXModule(KernelString, &Module); + * polly_getPTXKernelEntry(Entry, Module, &Kernel); + * polly_allocateMemoryForHostAndDevice(&HostData, &DevData, MemSize); + * polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData); + * polly_startTimerByCudaEvent(&Start, &Stop); + * polly_launchKernel(Kernel, GridWidth, GridHeight); + * polly_copyFromDeviceToHost(HostData, DevData, MemSize); + * polly_stopTimerByCudaEvent(Start, Stop, ElapsedTime); + * polly_cleanupGPGPUResources(HostData, DevData, Module, Context, Kernel); + * } + * + */ -void polly_initDevice(CUcontext *Context, CUdevice *Device); -void polly_getPTXModule(void *PTXBuffer, CUmodule *Module); -void polly_getPTXKernelEntry(const char *KernelName, - CUmodule *Module, - CUfunction *Kernel); -void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer, - cudaEvent_t *StopTimer); -void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, cudaEvent_t *StopTimer, +typedef struct PollyGPUContextT PollyGPUContext; +typedef struct PollyGPUModuleT PollyGPUModule; +typedef struct PollyGPUFunctionT PollyGPUFunction; +typedef struct PollyGPUDeviceT PollyGPUDevice; +typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; +typedef struct PollyGPUEventT PollyGPUEvent; + +void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device); +void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module); +void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, + PollyGPUFunction **Kernel); +void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop); +void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop, float *ElapsedTimes); -void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData, +void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData, int MemSize); -void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData, +void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData, int MemSize); -void polly_allocateMemoryForHostAndDevice(void **PtrHostData, - CUdeviceptr *PtrDevData, +void polly_allocateMemoryForHostAndDevice(void **HostData, + PollyGPUDevicePtr **DevData, int MemSize); -void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth, - int BlockHeight, CUdeviceptr DevData); -void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight); -void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData, - CUmodule *Module, CUcontext *Context); +void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth, + int BlockHeight, PollyGPUDevicePtr *DevData); +void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth, + int GridHeight); +void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData, + PollyGPUModule *Module, + PollyGPUContext *Context, + PollyGPUFunction *Kernel); #endif /* GPUJIT_H_ */ |