summaryrefslogtreecommitdiffstats
path: root/polly/tools
diff options
context:
space:
mode:
Diffstat (limited to 'polly/tools')
-rw-r--r--polly/tools/GPURuntime/GPUJIT.c181
-rw-r--r--polly/tools/GPURuntime/GPUJIT.h103
2 files changed, 216 insertions, 68 deletions
diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index f5936f2d155..dbe976e0753 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -12,9 +12,36 @@
/******************************************************************************/
#include "GPUJIT.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
#include <dlfcn.h>
#include <stdio.h>
+/* Define Polly's GPGPU data types. */
+struct PollyGPUContextT {
+ CUcontext Cuda;
+};
+
+struct PollyGPUModuleT {
+ CUmodule Cuda;
+};
+
+struct PollyGPUFunctionT {
+ CUfunction Cuda;
+};
+
+struct PollyGPUDeviceT {
+ CUdevice Cuda;
+};
+
+struct PollyGPUDevicePtrT {
+ CUdeviceptr Cuda;
+};
+
+struct PollyGPUEventT {
+ cudaEvent_t Cuda;
+};
+
/* Dynamic library handles for the CUDA and CUDA runtime library. */
static void *HandleCuda;
static void *HandleCudaRT;
@@ -218,7 +245,7 @@ static int initialDeviceAPIs() {
return 1;
}
-void polly_initDevice(CUcontext *Context, CUdevice *Device) {
+void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) {
int Major = 0, Minor = 0, DeviceID = 0;
char DeviceName[256];
int DeviceCount = 0;
@@ -242,85 +269,135 @@ void polly_initDevice(CUcontext *Context, CUdevice *Device) {
}
/* We select the 1st device as default. */
- CuDeviceGetFcnPtr(Device, 0);
+ *Device = malloc(sizeof(PollyGPUDevice));
+ if (*Device == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU device failed.\n");
+ exit(-1);
+ }
+ CuDeviceGetFcnPtr(&((*Device)->Cuda), 0);
/* Get compute capabilities and the device name. */
- CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, *Device);
- CuDeviceGetNameFcnPtr(DeviceName, 256, *Device);
+ CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda);
+ CuDeviceGetNameFcnPtr(DeviceName, 256, (*Device)->Cuda);
fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
/* Create context on the device. */
- CuCtxCreateFcnPtr(Context, 0, *Device);
+ *Context = malloc(sizeof(PollyGPUContext));
+ if (*Context == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
+ exit(-1);
+ }
+ CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda);
}
-void polly_getPTXModule(void *PTXBuffer, CUmodule *Module) {
- if(CuModuleLoadDataExFcnPtr(Module, PTXBuffer, 0, 0, 0) != CUDA_SUCCESS) {
+void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
+ *Module = malloc(sizeof(PollyGPUModule));
+ if (*Module == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
+ exit(-1);
+ }
+
+ if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0)
+ != CUDA_SUCCESS) {
fprintf(stdout, "Loading ptx assembly text failed.\n");
exit(-1);
}
}
-void polly_getPTXKernelEntry(const char *KernelName, CUmodule *Module,
- CUfunction *Kernel) {
+void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
+ PollyGPUFunction **Kernel) {
+ *Kernel = malloc(sizeof(PollyGPUFunction));
+ if (*Kernel == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
+ exit(-1);
+ }
+
/* Locate the kernel entry point. */
- if(CuModuleGetFunctionFcnPtr(Kernel, *Module, KernelName)
+ if(CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName)
!= CUDA_SUCCESS) {
fprintf(stdout, "Loading kernel function failed.\n");
exit(-1);
}
}
-void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer,
- cudaEvent_t *StopTimer) {
- CudaEventCreateFcnPtr(StartTimer);
- CudaEventCreateFcnPtr(StopTimer);
- CudaEventRecordFcnPtr(*StartTimer, 0);
+void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) {
+ *Start = malloc(sizeof(PollyGPUEvent));
+ if (*Start == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n");
+ exit(-1);
+ }
+ CudaEventCreateFcnPtr(&((*Start)->Cuda));
+
+ *Stop = malloc(sizeof(PollyGPUEvent));
+ if (*Stop == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n");
+ exit(-1);
+ }
+ CudaEventCreateFcnPtr(&((*Stop)->Cuda));
+
+ /* Record the start time. */
+ CudaEventRecordFcnPtr((*Start)->Cuda, 0);
}
-void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer,
- cudaEvent_t *StopTimer, float *ElapsedTimes) {
- CudaEventRecordFcnPtr(*StopTimer, 0);
- CudaEventSynchronizeFcnPtr(*StopTimer);
- CudaEventElapsedTimeFcnPtr(ElapsedTimes, *StartTimer, *StopTimer );
- CudaEventDestroyFcnPtr(*StartTimer);
- CudaEventDestroyFcnPtr(*StopTimer);
+void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
+ float *ElapsedTimes) {
+ /* Record the end time. */
+ CudaEventRecordFcnPtr(Stop->Cuda, 0);
+ CudaEventSynchronizeFcnPtr(Start->Cuda);
+ CudaEventSynchronizeFcnPtr(Stop->Cuda);
+ CudaEventElapsedTimeFcnPtr(ElapsedTimes, Start->Cuda, Stop->Cuda);
+ CudaEventDestroyFcnPtr(Start->Cuda);
+ CudaEventDestroyFcnPtr(Stop->Cuda);
fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes);
+
+ free(Start);
+ free(Stop);
}
-void polly_allocateMemoryForHostAndDevice(void **PtrHostData,
- CUdeviceptr *PtrDevData,
+void polly_allocateMemoryForHostAndDevice(void **HostData,
+ PollyGPUDevicePtr **DevData,
int MemSize) {
- if ((*PtrHostData = (int *)malloc(MemSize)) == 0) {
+ if ((*HostData = (int *)malloc(MemSize)) == 0) {
fprintf(stdout, "Could not allocate host memory.\n");
exit(-1);
}
- CuMemAllocFcnPtr(PtrDevData, MemSize);
+
+ *DevData = malloc(sizeof(PollyGPUDevicePtr));
+ if (*DevData == 0) {
+ fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
+ exit(-1);
+ }
+ CuMemAllocFcnPtr(&((*DevData)->Cuda), MemSize);
}
-void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData,
+void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
int MemSize) {
- CuMemcpyHtoDFcnPtr(DevData, HostData, MemSize);
+ CUdeviceptr CuDevData = DevData->Cuda;
+ CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
}
-void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData,
+void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
int MemSize) {
- if(CuMemcpyDtoHFcnPtr(HostData, DevData, MemSize) != CUDA_SUCCESS) {
+ if(CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
fprintf(stdout, "Copying results from device to host memory failed.\n");
exit(-1);
}
}
-void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth,
- int BlockHeight, CUdeviceptr DevData) {
+void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
+ int BlockHeight, PollyGPUDevicePtr *DevData) {
int ParamOffset = 0;
- CuFuncSetBlockShapeFcnPtr(*Kernel, BlockWidth, BlockHeight, 1);
- CuParamSetvFcnPtr(*Kernel, ParamOffset, &DevData, sizeof(DevData));
- ParamOffset += sizeof(DevData);
- CuParamSetSizeFcnPtr(*Kernel, ParamOffset);
+
+ CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1);
+ CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda),
+ sizeof(DevData->Cuda));
+ ParamOffset += sizeof(DevData->Cuda);
+ CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset);
}
-void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) {
- if (CuLaunchGridFcnPtr(*Kernel, GridWidth, GridHeight) != CUDA_SUCCESS) {
+void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
+ int GridHeight) {
+ if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) {
fprintf(stdout, "Launching CUDA kernel failed.\n");
exit(-1);
}
@@ -328,26 +405,32 @@ void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight) {
fprintf(stdout, "CUDA kernel launched.\n");
}
-void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData,
- CUmodule *Module, CUcontext *Context) {
+void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
+ PollyGPUModule *Module,
+ PollyGPUContext *Context,
+ PollyGPUFunction *Kernel) {
if (HostData) {
free(HostData);
HostData = 0;
}
- if (DevData) {
- CuMemFreeFcnPtr(DevData);
- DevData = 0;
+ if (DevData->Cuda) {
+ CuMemFreeFcnPtr(DevData->Cuda);
+ free(DevData);
+ }
+
+ if (Module->Cuda) {
+ CuModuleUnloadFcnPtr(Module->Cuda);
+ free(Module);
}
- if (*Module) {
- CuModuleUnloadFcnPtr(*Module);
- *Module = 0;
+ if (Context->Cuda) {
+ CuCtxDestroyFcnPtr(Context->Cuda);
+ free(Context);
}
- if (*Context) {
- CuCtxDestroyFcnPtr(*Context);
- *Context = 0;
+ if (Kernel) {
+ free(Kernel);
}
dlclose(HandleCuda);
diff --git a/polly/tools/GPURuntime/GPUJIT.h b/polly/tools/GPURuntime/GPUJIT.h
index 718544faefe..9026ab42f34 100644
--- a/polly/tools/GPURuntime/GPUJIT.h
+++ b/polly/tools/GPURuntime/GPUJIT.h
@@ -14,28 +14,93 @@
#ifndef GPUJIT_H_
#define GPUJIT_H_
-#include <cuda.h>
-#include <cuda_runtime.h>
+/*
+ * The following demonstrates how we can use the GPURuntime library to
+ * execute a GPU kernel.
+ *
+ * char KernelString[] = "\n\
+ * .version 1.4\n\
+ * .target sm_10, map_f64_to_f32\n\
+ * .entry _Z8myKernelPi (\n\
+ * .param .u64 __cudaparm__Z8myKernelPi_data)\n\
+ * {\n\
+ * .reg .u16 %rh<4>;\n\
+ * .reg .u32 %r<5>;\n\
+ * .reg .u64 %rd<6>;\n\
+ * cvt.u32.u16 %r1, %tid.x;\n\
+ * mov.u16 %rh1, %ctaid.x;\n\
+ * mov.u16 %rh2, %ntid.x;\n\
+ * mul.wide.u16 %r2, %rh1, %rh2;\n\
+ * add.u32 %r3, %r1, %r2;\n\
+ * ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\
+ * cvt.s64.s32 %rd2, %r3;\n\
+ * mul.wide.s32 %rd3, %r3, 4;\n\
+ * add.u64 %rd4, %rd1, %rd3;\n\
+ * st.global.s32 [%rd4+0], %r3;\n\
+ * exit;\n\
+ * }\n\
+ * ";
+ *
+ * const char *Entry = "_Z8myKernelPi";
+ *
+ * int main() {
+ * PollyGPUContext *Context;
+ * PollyGPUModule *Module;
+ * PollyGPUFunction *Kernel;
+ * PollyGPUDevice *Device;
+ * PollyGPUDevicePtr *DevData;
+ * int *HostData;
+ * PollyGPUEvent *Start;
+ * PollyGPUEvent *Stop;
+ * float ElapsedTime;
+ * int MemSize;
+ * int BlockWidth = 16;
+ * int BlockHeight = 16;
+ * int GridWidth = 8;
+ * int GridHeight = 8;
+ *
+ * MemSize = 256*64*sizeof(int);
+ * polly_initDevice(&Context, &Device);
+ * polly_getPTXModule(KernelString, &Module);
+ * polly_getPTXKernelEntry(Entry, Module, &Kernel);
+ * polly_allocateMemoryForHostAndDevice(&HostData, &DevData, MemSize);
+ * polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
+ * polly_startTimerByCudaEvent(&Start, &Stop);
+ * polly_launchKernel(Kernel, GridWidth, GridHeight);
+ * polly_copyFromDeviceToHost(HostData, DevData, MemSize);
+ * polly_stopTimerByCudaEvent(Start, Stop, &ElapsedTime);
+ * polly_cleanupGPGPUResources(HostData, DevData, Module, Context, Kernel);
+ * }
+ *
+ */
-void polly_initDevice(CUcontext *Context, CUdevice *Device);
-void polly_getPTXModule(void *PTXBuffer, CUmodule *Module);
-void polly_getPTXKernelEntry(const char *KernelName,
- CUmodule *Module,
- CUfunction *Kernel);
-void polly_startTimerByCudaEvent(cudaEvent_t *StartTimer,
- cudaEvent_t *StopTimer);
-void polly_stopTimerByCudaEvent(cudaEvent_t *StartTimer, cudaEvent_t *StopTimer,
+typedef struct PollyGPUContextT PollyGPUContext;
+typedef struct PollyGPUModuleT PollyGPUModule;
+typedef struct PollyGPUFunctionT PollyGPUFunction;
+typedef struct PollyGPUDeviceT PollyGPUDevice;
+typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
+typedef struct PollyGPUEventT PollyGPUEvent;
+
+void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device);
+void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
+void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
+ PollyGPUFunction **Kernel);
+void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop);
+void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
float *ElapsedTimes);
-void polly_copyFromHostToDevice(CUdeviceptr DevData, void *HostData,
+void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
int MemSize);
-void polly_copyFromDeviceToHost(void *HostData, CUdeviceptr DevData,
+void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
int MemSize);
-void polly_allocateMemoryForHostAndDevice(void **PtrHostData,
- CUdeviceptr *PtrDevData,
+void polly_allocateMemoryForHostAndDevice(void **HostData,
+ PollyGPUDevicePtr **DevData,
int MemSize);
-void polly_setKernelParameters(CUfunction *Kernel, int BlockWidth,
- int BlockHeight, CUdeviceptr DevData);
-void polly_launchKernel(CUfunction *Kernel, int GridWidth, int GridHeight);
-void polly_cleanupGPGPUResources(void *HostData, CUdeviceptr DevData,
- CUmodule *Module, CUcontext *Context);
+void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
+ int BlockHeight, PollyGPUDevicePtr *DevData);
+void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
+ int GridHeight);
+void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
+ PollyGPUModule *Module,
+ PollyGPUContext *Context,
+ PollyGPUFunction *Kernel);
#endif /* GPUJIT_H_ */
OpenPOWER on IntegriCloud