summaryrefslogtreecommitdiffstats
path: root/polly/test/CodeGen
diff options
context:
space:
mode:
authorTobias Grosser <grosser@fim.uni-passau.de>2012-08-03 12:50:07 +0000
committerTobias Grosser <grosser@fim.uni-passau.de>2012-08-03 12:50:07 +0000
commit6217e18a7dd50c9f112f8468f7c77fc260f40d89 (patch)
treecbf2767ae63ffb99467b07bc3edf7ec7d461aff0 /polly/test/CodeGen
parent7555b54020b1abcc1e67f66452600e2fa8f93dcd (diff)
downloadbcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.tar.gz
bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.zip
Add preliminary implementation for GPGPU code generation.
Translate the selected parallel loop body into a ptx string and run it with the cuda driver API. We limit this preliminary implementation to target the following special test cases: - Support only 2-dimensional parallel loops with or without only one innermost non-parallel loop. - Support write memory access to only one array in a SCoP. The patch was committed with smaller changes to the build system: There is now a flag to enable gpu code generation explicitly. This was required as we need the llvm.codegen() patch applied on the llvm sources to compile this feature correctly. Also, enabling gpu code generation does not require cuda. This requirement was removed to allow 'make polly-test' runs, even without an installed cuda runtime. Contributed by: Yabin Hu <yabin.hwu@gmail.com> llvm-svn: 161239
Diffstat (limited to 'polly/test/CodeGen')
-rw-r--r--polly/test/CodeGen/GPGPU/2d_innermost_parallel.c16
-rw-r--r--polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll65
-rw-r--r--polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c17
-rw-r--r--polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll88
-rw-r--r--polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop21
-rw-r--r--polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu21
-rw-r--r--polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop17
-rw-r--r--polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu17
-rw-r--r--polly/test/CodeGen/GPGPU/lit.local.cfg5
9 files changed, 267 insertions, 0 deletions
diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
new file mode 100644
index 00000000000..b6397d19429
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
@@ -0,0 +1,16 @@
+int A[128][128];
+
+int gpu_pure() {
+ int i,j;
+
+ for(i = 0; i < 128; i++)
+ for(j = 0; j < 128; j++)
+ A[i][j] = i*128 + j;
+
+ return 0;
+}
+
+int main() {
+ int b = gpu_pure();
+ return 0;
+}
diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
new file mode 100644
index 00000000000..820280367ab
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '2d_innermost_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_pure() nounwind uwtable {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc6, %entry
+ %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
+ %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+ %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+ br i1 %exitcond6, label %for.body, label %for.end8
+
+for.body: ; preds = %for.cond
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3: ; preds = %for.cond1
+ %tmp = shl nsw i64 %indvars.iv2, 7
+ %tmp7 = add nsw i64 %tmp, %indvars.iv
+ %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+ %tmp8 = trunc i64 %tmp7 to i32
+ store i32 %tmp8, i32* %arrayidx5, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body3
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ br label %for.cond1
+
+for.end: ; preds = %for.cond1
+ br label %for.inc6
+
+for.inc6: ; preds = %for.end
+ %indvars.iv.next3 = add i64 %indvars.iv2, 1
+ br label %for.cond
+
+for.end8: ; preds = %for.cond
+ ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+ %call = call i32 @gpu_pure()
+ ret i32 0
+}
+
+; CHECK: call void @polly_initDevice
+; CHECK: call void @polly_getPTXModule
+; CHECK: call void @polly_getPTXKernelEntry
+; CHECK: call void @polly_allocateMemoryForHostAndDevice
+; CHECK: call void @polly_setKernelParameters
+; CHECK: call void @polly_startTimerByCudaEvent
+; CHECK: call void @polly_launchKernel
+; CHECK: call void @polly_copyFromDeviceToHost
+; CHECK: call void @polly_stopTimerByCudaEvent
+; CHECK: call void @polly_cleanupGPGPUResources
diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
new file mode 100644
index 00000000000..dae115ea3b4
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
@@ -0,0 +1,17 @@
+int A[128][128];
+
+int gpu_no_pure() {
+ int i,j,k;
+
+ for(i = 0; i < 128; i++)
+ for(j = 0; j < 128; j++)
+ for(k = 0; k < 256; k++)
+ A[i][j] += i*123/(k+1)+5-j*k-123;
+
+ return 0;
+}
+
+int main() {
+ int b = gpu_no_pure();
+ return 0;
+}
diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
new file mode 100644
index 00000000000..588f581b2ef
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '3d_innermost_non_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_no_pure() nounwind uwtable {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc16, %entry
+ %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
+ %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+ %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+ br i1 %exitcond6, label %for.body, label %for.end18
+
+for.body: ; preds = %for.cond
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc13, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond1 = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond1, label %for.body3, label %for.end15
+
+for.body3: ; preds = %for.cond1
+ br label %for.cond4
+
+for.cond4: ; preds = %for.inc, %for.body3
+ %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+ %exitcond = icmp ne i32 %k.0, 256
+ br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6: ; preds = %for.cond4
+ %tmp = mul nsw i64 %indvars.iv2, 123
+ %add = add nsw i32 %k.0, 1
+ %tmp7 = trunc i64 %tmp to i32
+ %div = sdiv i32 %tmp7, %add
+ %add7 = add nsw i32 %div, 5
+ %tmp8 = trunc i64 %indvars.iv to i32
+ %mul8 = mul nsw i32 %tmp8, %k.0
+ %sub = sub nsw i32 %add7, %mul8
+ %sub9 = add nsw i32 %sub, -123
+ %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+ %tmp9 = load i32* %arrayidx11, align 4
+ %add12 = add nsw i32 %tmp9, %sub9
+ store i32 %add12, i32* %arrayidx11, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body6
+ %inc = add nsw i32 %k.0, 1
+ br label %for.cond4
+
+for.end: ; preds = %for.cond4
+ br label %for.inc13
+
+for.inc13: ; preds = %for.end
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ br label %for.cond1
+
+for.end15: ; preds = %for.cond1
+ br label %for.inc16
+
+for.inc16: ; preds = %for.end15
+ %indvars.iv.next3 = add i64 %indvars.iv2, 1
+ br label %for.cond
+
+for.end18: ; preds = %for.cond
+ ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+ %call = call i32 @gpu_no_pure()
+ ret i32 0
+}
+
+; CHECK: call void @polly_initDevice
+; CHECK: call void @polly_getPTXModule
+; CHECK: call void @polly_getPTXKernelEntry
+; CHECK: call void @polly_allocateMemoryForHostAndDevice
+; CHECK: call void @polly_setKernelParameters
+; CHECK: call void @polly_startTimerByCudaEvent
+; CHECK: call void @polly_launchKernel
+; CHECK: call void @polly_copyFromDeviceToHost
+; CHECK: call void @polly_stopTimerByCudaEvent
+; CHECK: call void @polly_cleanupGPGPUResources
diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
new file mode 100644
index 00000000000..0d7a260a700
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
@@ -0,0 +1,21 @@
+{
+ "context" : "{ : }",
+ "name" : "for.cond => for.end18",
+ "statements" : [
+ {
+ "accesses" : [
+ {
+ "kind" : "read",
+ "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+ },
+ {
+ "kind" : "write",
+ "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+ }
+ ],
+ "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+ "name" : "Stmt_for_body6",
+ "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
+ }
+ ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
new file mode 100644
index 00000000000..6f007fa7864
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
@@ -0,0 +1,21 @@
+{
+ "context" : "{ : }",
+ "name" : "for.cond => for.end18",
+ "statements" : [
+ {
+ "accesses" : [
+ {
+ "kind" : "read",
+ "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+ },
+ {
+ "kind" : "write",
+ "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+ }
+ ],
+ "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+ "name" : "Stmt_for_body6",
+ "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+ }
+ ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
new file mode 100644
index 00000000000..693c5097312
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
@@ -0,0 +1,17 @@
+{
+ "context" : "{ : }",
+ "name" : "for.cond => for.end8",
+ "statements" : [
+ {
+ "accesses" : [
+ {
+ "kind" : "write",
+ "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+ }
+ ],
+ "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+ "name" : "Stmt_for_body3",
+ "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
+ }
+ ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
new file mode 100644
index 00000000000..fef61050e93
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
@@ -0,0 +1,17 @@
+{
+ "context" : "{ : }",
+ "name" : "for.cond => for.end8",
+ "statements" : [
+ {
+ "accesses" : [
+ {
+ "kind" : "write",
+ "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+ }
+ ],
+ "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+ "name" : "Stmt_for_body3",
+ "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+ }
+ ]
+}
diff --git a/polly/test/CodeGen/GPGPU/lit.local.cfg b/polly/test/CodeGen/GPGPU/lit.local.cfg
new file mode 100644
index 00000000000..1e96dc6087c
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+gpgpu = config.root.enable_gpgpu_codegen
+if gpgpu not in ['TRUE', 'true'] :
+ config.unsupported = True
OpenPOWER on IntegriCloud