summaryrefslogtreecommitdiffstats
path: root/polly/tools/GPURuntime/GPUJIT.h
blob: 9026ab42f348cd84feebb3852efa1ccae786d9d1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/******************************************************************************/
/*                                                                            */
/*                     The LLVM Compiler Infrastructure                       */
/*                                                                            */
/* This file is distributed under the University of Illinois Open Source      */
/* License. See LICENSE.TXT for details.                                      */
/*                                                                            */
/******************************************************************************/
/*                                                                            */
/*  This file defines GPUJIT.                                                 */
/*                                                                            */
/******************************************************************************/

#ifndef GPUJIT_H_
#define GPUJIT_H_

/*
 * The following demonstrates how we can use the GPURuntime library to
 * execute a GPU kernel.
 *
 * char KernelString[] = "\n\
 *   .version 1.4\n\
 *   .target sm_10, map_f64_to_f32\n\
 *   .entry _Z8myKernelPi (\n\
 *   .param .u64 __cudaparm__Z8myKernelPi_data)\n\
 *   {\n\
 *     .reg .u16 %rh<4>;\n\
 *     .reg .u32 %r<5>;\n\
 *     .reg .u64 %rd<6>;\n\
 *     cvt.u32.u16     %r1, %tid.x;\n\
 *     mov.u16         %rh1, %ctaid.x;\n\
 *     mov.u16         %rh2, %ntid.x;\n\
 *     mul.wide.u16    %r2, %rh1, %rh2;\n\
 *     add.u32         %r3, %r1, %r2;\n\
 *     ld.param.u64    %rd1, [__cudaparm__Z8myKernelPi_data];\n\
 *     cvt.s64.s32     %rd2, %r3;\n\
 *     mul.wide.s32    %rd3, %r3, 4;\n\
 *     add.u64         %rd4, %rd1, %rd3;\n\
 *     st.global.s32   [%rd4+0], %r3;\n\
 *     exit;\n\
 *   }\n\
 * ";
 *
 * const char *Entry = "_Z8myKernelPi";
 *
 * int main() {
 *   PollyGPUContext *Context;
 *   PollyGPUModule *Module;
 *   PollyGPUFunction *Kernel;
 *   PollyGPUDevice *Device;
 *   PollyGPUDevicePtr *DevData;
 *   int *HostData;
 *   PollyGPUEvent *Start;
 *   PollyGPUEvent *Stop;
 *   float ElapsedTime;
 *   int MemSize;
 *   int BlockWidth = 16;
 *   int BlockHeight = 16;
 *   int GridWidth = 8;
 *   int GridHeight = 8;
 *
 *   MemSize = 256*64*sizeof(int);
 *   polly_initDevice(&Context, &Device);
 *   polly_getPTXModule(KernelString, &Module);
 *   polly_getPTXKernelEntry(Entry, Module, &Kernel);
 *   polly_allocateMemoryForHostAndDevice(&HostData, &DevData, MemSize);
 *   polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
 *   polly_startTimerByCudaEvent(&Start, &Stop);
 *   polly_launchKernel(Kernel, GridWidth, GridHeight);
 *   polly_copyFromDeviceToHost(HostData, DevData, MemSize);
 *   polly_stopTimerByCudaEvent(Start, Stop, &ElapsedTime);
 *   polly_cleanupGPGPUResources(HostData, DevData, Module, Context, Kernel);
 * }
 *
 */

typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUModuleT PollyGPUModule;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDeviceT PollyGPUDevice;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
typedef struct PollyGPUEventT PollyGPUEvent;

/*
 * Initialize the GPU device (first call in the usage example of this header).
 *
 * NOTE(review): Context and Device are pointer-to-pointer parameters and the
 * usage example passes the addresses of unset handles, so both are presumably
 * outputs filled in by this call -- confirm against the implementation.
 * No status is returned (void), so failures are not reported to the caller
 * through this interface.
 */
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device);
/*
 * Obtain a module handle from a PTX buffer.
 *
 * PTXBuffer: the usage example in this header passes a NUL-terminated string
 *            of PTX assembly text.
 * Module:    presumably an output receiving the new module handle (the usage
 *            example passes the address of an unset pointer) -- TODO confirm.
 */
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
/*
 * Look up a kernel entry point by name within a module.
 *
 * KernelName: name of the entry; the usage example passes the name that
 *             appears after ".entry" in the PTX text.
 * Module:     module handle, as produced via polly_getPTXModule in the usage
 *             example.
 * Kernel:     presumably an output receiving the kernel handle (the usage
 *             example passes the address of an unset pointer) -- TODO confirm.
 */
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
                             PollyGPUFunction **Kernel);
/*
 * Begin timing; paired with polly_stopTimerByCudaEvent.
 *
 * NOTE(review): Start and Stop are pointer-to-pointer parameters and the
 * usage example passes addresses of unset handles, so both event handles are
 * presumably created by this call -- confirm against the implementation.
 */
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop);
/*
 * End timing started with polly_startTimerByCudaEvent.
 *
 * Start, Stop:  the event handles used with polly_startTimerByCudaEvent.
 * ElapsedTimes: presumably an output location receiving the elapsed time;
 *               the unit is not visible from this header -- TODO confirm.
 */
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
                                float *ElapsedTimes);
/*
 * Copy data from host memory to the device.
 *
 * DevData:  destination device-memory handle.
 * HostData: source host buffer.
 * MemSize:  amount to copy; presumably a byte count (the usage example
 *           computes MemSize with sizeof) -- TODO confirm. Note the type is
 *           int, not size_t.
 */
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
                                int MemSize);
/*
 * Copy data from the device back to host memory.
 *
 * HostData: destination host buffer.
 * DevData:  source device-memory handle.
 * MemSize:  amount to copy; presumably a byte count (the usage example
 *           computes MemSize with sizeof) -- TODO confirm. Note the type is
 *           int, not size_t.
 *
 * Note that the argument order (host first) is the reverse of
 * polly_copyFromHostToDevice (device first).
 */
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
                                int MemSize);
/*
 * Allocate a host buffer and a matching device buffer.
 *
 * HostData: presumably an output receiving the host buffer pointer.
 * DevData:  presumably an output receiving the device-memory handle.
 * MemSize:  requested size; presumably a byte count (the usage example
 *           computes MemSize with sizeof) -- TODO confirm.
 *
 * NOTE(review): both results are pointer-to-pointer parameters; in the usage
 * example they are later handed to polly_cleanupGPGPUResources, which
 * presumably releases them -- confirm ownership against the implementation.
 */
void polly_allocateMemoryForHostAndDevice(void **HostData,
                                          PollyGPUDevicePtr **DevData,
                                          int MemSize);
/*
 * Configure a kernel before launch.
 *
 * Kernel:      kernel handle to configure.
 * BlockWidth,
 * BlockHeight: block dimensions (16 x 16 in the usage example); presumably
 *              the thread-block shape -- TODO confirm.
 * DevData:     device-memory handle; presumably bound as the kernel's
 *              argument (the example kernel takes a single pointer
 *              parameter) -- TODO confirm.
 */
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
                               int BlockHeight, PollyGPUDevicePtr *DevData);
/*
 * Launch a kernel on a GridWidth x GridHeight grid.
 *
 * Kernel:     kernel handle; in the usage example it is configured with
 *             polly_setKernelParameters before this call.
 * GridWidth,
 * GridHeight: grid dimensions (8 x 8 in the usage example).
 *
 * NOTE(review): whether this call blocks until the kernel finishes is not
 * visible from this header -- confirm against the implementation.
 */
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
                        int GridHeight);
/*
 * Release the resources used by a GPU run (last call in the usage example).
 *
 * HostData: host buffer, as obtained via polly_allocateMemoryForHostAndDevice
 *           in the usage example.
 * DevData:  device-memory handle from the same allocation call.
 * Module:   module handle.
 * Context:  context handle.
 * Kernel:   kernel handle.
 *
 * NOTE(review): presumably none of the handles may be used after this call;
 * which of them are actually freed is not visible from this header --
 * confirm against the implementation.
 */
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
                                 PollyGPUModule *Module,
                                 PollyGPUContext *Context,
                                 PollyGPUFunction *Kernel);
#endif /* GPUJIT_H_ */
OpenPOWER on IntegriCloud