diff options
| author | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-08-03 12:50:07 +0000 |
|---|---|---|
| committer | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-08-03 12:50:07 +0000 |
| commit | 6217e18a7dd50c9f112f8468f7c77fc260f40d89 (patch) | |
| tree | cbf2767ae63ffb99467b07bc3edf7ec7d461aff0 /polly/test/CodeGen | |
| parent | 7555b54020b1abcc1e67f66452600e2fa8f93dcd (diff) | |
| download | bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.tar.gz bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.zip | |
Add preliminary implementation for GPGPU code generation.
Translate the selected parallel loop body into a ptx string and run it with the
cuda driver API. We limit this preliminary implementation to target the
following special test cases:
- Support only 2-dimensional parallel loops, optionally with a single innermost
non-parallel loop.
- Support write memory access to only one array in a SCoP.
The patch was committed with smaller changes to the build system:
There is now a flag to enable gpu code generation explicitly. This was required
as we need the llvm.codegen() patch applied to the llvm sources to compile this
feature correctly. Also, enabling gpu code generation does not require cuda.
This requirement was removed to allow 'make polly-test' runs, even without an
installed cuda runtime.
Contributed by: Yabin Hu <yabin.hwu@gmail.com>
llvm-svn: 161239
Diffstat (limited to 'polly/test/CodeGen')
9 files changed, 267 insertions, 0 deletions
diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c new file mode 100644 index 00000000000..b6397d19429 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c @@ -0,0 +1,16 @@ +int A[128][128]; + +int gpu_pure() { + int i,j; + + for(i = 0; i < 128; i++) + for(j = 0; j < 128; j++) + A[i][j] = i*128 + j; + + return 0; +} + +int main() { + int b = gpu_pure(); + return 0; +} diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll new file mode 100644 index 00000000000..820280367ab --- /dev/null +++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s +; ModuleID = '2d_innermost_parallel.s' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [128 x [128 x i32]] zeroinitializer, align 16 + +define i32 @gpu_pure() nounwind uwtable { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ] + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32 + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 + br i1 %exitcond6, label %for.body, label %for.end8 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %tmp = shl nsw i64 %indvars.iv2, 7 + %tmp7 = add nsw i64 %tmp, 
%indvars.iv + %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv + %tmp8 = trunc i64 %tmp7 to i32 + store i32 %tmp8, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %indvars.iv.next = add i64 %indvars.iv, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc6 + +for.inc6: ; preds = %for.end + %indvars.iv.next3 = add i64 %indvars.iv2, 1 + br label %for.cond + +for.end8: ; preds = %for.cond + ret i32 0 +} + +define i32 @main() nounwind uwtable { +entry: + %call = call i32 @gpu_pure() + ret i32 0 +} + +; CHECK: call void @polly_initDevice +; CHECK: call void @polly_getPTXModule +; CHECK: call void @polly_getPTXKernelEntry +; CHECK: call void @polly_allocateMemoryForHostAndDevice +; CHECK: call void @polly_setKernelParameters +; CHECK: call void @polly_startTimerByCudaEvent +; CHECK: call void @polly_launchKernel +; CHECK: call void @polly_copyFromDeviceToHost +; CHECK: call void @polly_stopTimerByCudaEvent +; CHECK: call void @polly_cleanupGPGPUResources diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c new file mode 100644 index 00000000000..dae115ea3b4 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c @@ -0,0 +1,17 @@ +int A[128][128]; + +int gpu_no_pure() { + int i,j,k; + + for(i = 0; i < 128; i++) + for(j = 0; j < 128; j++) + for(k = 0; k < 256; k++) + A[i][j] += i*123/(k+1)+5-j*k-123; + + return 0; +} + +int main() { + int b = gpu_no_pure(); + return 0; +} diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll new file mode 100644 index 00000000000..588f581b2ef --- /dev/null +++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll @@ -0,0 +1,88 @@ +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu 
-polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s +; ModuleID = '3d_innermost_non_parallel.s' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [128 x [128 x i32]] zeroinitializer, align 16 + +define i32 @gpu_no_pure() nounwind uwtable { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc16, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ] + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32 + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 + br i1 %exitcond6, label %for.body, label %for.end18 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc13, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond1 = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond1, label %for.body3, label %for.end15 + +for.body3: ; preds = %for.cond1 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %k.0, 256 + br i1 %exitcond, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %tmp = mul nsw i64 %indvars.iv2, 123 + %add = add nsw i32 %k.0, 1 + %tmp7 = trunc i64 %tmp to i32 + %div = sdiv i32 %tmp7, %add + %add7 = add nsw i32 %div, 5 + %tmp8 = trunc i64 %indvars.iv to i32 + %mul8 = mul nsw i32 %tmp8, %k.0 + %sub = sub nsw i32 %add7, %mul8 + %sub9 = add nsw i32 %sub, -123 + %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv + %tmp9 = load i32* %arrayidx11, align 4 + %add12 = add nsw i32 %tmp9, %sub9 + store i32 %add12, i32* %arrayidx11, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %inc = add nsw i32 %k.0, 1 + br label %for.cond4 + +for.end: ; preds 
= %for.cond4 + br label %for.inc13 + +for.inc13: ; preds = %for.end + %indvars.iv.next = add i64 %indvars.iv, 1 + br label %for.cond1 + +for.end15: ; preds = %for.cond1 + br label %for.inc16 + +for.inc16: ; preds = %for.end15 + %indvars.iv.next3 = add i64 %indvars.iv2, 1 + br label %for.cond + +for.end18: ; preds = %for.cond + ret i32 0 +} + +define i32 @main() nounwind uwtable { +entry: + %call = call i32 @gpu_no_pure() + ret i32 0 +} + +; CHECK: call void @polly_initDevice +; CHECK: call void @polly_getPTXModule +; CHECK: call void @polly_getPTXKernelEntry +; CHECK: call void @polly_allocateMemoryForHostAndDevice +; CHECK: call void @polly_setKernelParameters +; CHECK: call void @polly_startTimerByCudaEvent +; CHECK: call void @polly_launchKernel +; CHECK: call void @polly_copyFromDeviceToHost +; CHECK: call void @polly_stopTimerByCudaEvent +; CHECK: call void @polly_cleanupGPGPUResources diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop new file mode 100644 index 00000000000..0d7a260a700 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop @@ -0,0 +1,21 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end18", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu new file mode 100644 index 
00000000000..6f007fa7864 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu @@ -0,0 +1,21 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end18", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop new file mode 100644 index 00000000000..693c5097312 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop @@ -0,0 +1,17 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end8", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu new file mode 100644 index 00000000000..fef61050e93 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu @@ -0,0 +1,17 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end8", + "statements" : [ + { + "accesses" : [ + { + 
"kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/lit.local.cfg b/polly/test/CodeGen/GPGPU/lit.local.cfg new file mode 100644 index 00000000000..1e96dc6087c --- /dev/null +++ b/polly/test/CodeGen/GPGPU/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll'] + +gpgpu = config.root.enable_gpgpu_codegen +if gpgpu not in ['TRUE', 'true'] : + config.unsupported = True |

