Add preliminary implementation for GPGPU code generation.

Translate the selected parallel loop body into a PTX string and run it with the CUDA driver API. We limit this preliminary implementation to the following special test cases:
- Support only 2-dimensional parallel loops, optionally containing a single innermost non-parallel loop.
- Support write memory access to only one array in a SCoP.

The patch was committed with minor changes to the build system: there is now a flag to enable GPU code generation explicitly. This was required because the llvm.codegen() patch must be applied to the LLVM sources for this feature to compile correctly. Also, enabling GPU code generation does not require CUDA. This requirement was removed to allow 'make polly-test' runs even without an installed CUDA runtime.

Contributed by: Yabin Hu <yabin.hwu@gmail.com>

llvm-svn: 161239
author: Tobias Grosser <grosser@fim.uni-passau.de> 2012-08-03 12:50:07 +0000
committer: Tobias Grosser <grosser@fim.uni-passau.de> 2012-08-03 12:50:07 +0000
commit: 6217e18a7dd50c9f112f8468f7c77fc260f40d89 (patch)
tree: cbf2767ae63ffb99467b07bc3edf7ec7d461aff0 /polly/test/CodeGen
parent: 7555b54020b1abcc1e67f66452600e2fa8f93dcd (diff)
download: bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.tar.gz
bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.zip
9 files changed, 267 insertions, 0 deletions
diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
new file mode 100644
index 00000000000..b6397d19429
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
@@ -0,0 +1,16 @@
+int A[128][128];
+
+int gpu_pure() {
+  int i,j;
+
+  for(i = 0; i < 128; i++)
+    for(j = 0; j < 128; j++)
+      A[i][j] = i*128 + j;
+
+  return 0;
+}
+
+int main() {
+  int b = gpu_pure();
+  return 0;
+}
diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
new file mode 100644
index 00000000000..820280367ab
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '2d_innermost_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_pure() nounwind uwtable {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc6, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+  br i1 %exitcond6, label %for.body, label %for.end8
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %tmp = shl nsw i64 %indvars.iv2, 7
+  %tmp7 = add nsw i64 %tmp, %indvars.iv
+  %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp8 = trunc i64 %tmp7 to i32
+  store i32 %tmp8, i32* %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc6
+
+for.inc6:                                         ; preds = %for.end
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end8:                                         ; preds = %for.cond
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+  %call = call i32 @gpu_pure()
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources
diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
new file mode 100644
index 00000000000..dae115ea3b4
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
@@ -0,0 +1,17 @@
+int A[128][128];
+
+int gpu_no_pure() {
+  int i,j,k;
+
+  for(i = 0; i < 128; i++)
+    for(j = 0; j < 128; j++)
+      for(k = 0; k < 256; k++)
+        A[i][j] += i*123/(k+1)+5-j*k-123;
+
+  return 0;
+}
+
+int main() {
+  int b = gpu_no_pure();
+  return 0;
+}
diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
new file mode 100644
index 00000000000..588f581b2ef
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '3d_innermost_non_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_no_pure() nounwind uwtable {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc16, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+  br i1 %exitcond6, label %for.body, label %for.end18
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc13, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond1 = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond1, label %for.body3, label %for.end15
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 256
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %tmp = mul nsw i64 %indvars.iv2, 123
+  %add = add nsw i32 %k.0, 1
+  %tmp7 = trunc i64 %tmp to i32
+  %div = sdiv i32 %tmp7, %add
+  %add7 = add nsw i32 %div, 5
+  %tmp8 = trunc i64 %indvars.iv to i32
+  %mul8 = mul nsw i32 %tmp8, %k.0
+  %sub = sub nsw i32 %add7, %mul8
+  %sub9 = add nsw i32 %sub, -123
+  %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp9 = load i32* %arrayidx11, align 4
+  %add12 = add nsw i32 %tmp9, %sub9
+  store i32 %add12, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.end
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end15:                                        ; preds = %for.cond1
+  br label %for.inc16
+
+for.inc16:                                        ; preds = %for.end15
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end18:                                        ; preds = %for.cond
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+  %call = call i32 @gpu_no_pure()
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources
diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
new file mode 100644
index 00000000000..0d7a260a700
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
@@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
+      }
+   ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
new file mode 100644
index 00000000000..6f007fa7864
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
@@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
new file mode 100644
index 00000000000..693c5097312
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
@@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
+      }
+   ]
+}
diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
new file mode 100644
index 00000000000..fef61050e93
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
@@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}
diff --git a/polly/test/CodeGen/GPGPU/lit.local.cfg b/polly/test/CodeGen/GPGPU/lit.local.cfg
new file mode 100644
index 00000000000..1e96dc6087c
--- /dev/null
+++ b/polly/test/CodeGen/GPGPU/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+gpgpu = config.root.enable_gpgpu_codegen
+if gpgpu not in ['TRUE', 'true'] :
+    config.unsupported = True
author	Tobias Grosser <grosser@fim.uni-passau.de>	2012-08-03 12:50:07 +0000
committer	Tobias Grosser <grosser@fim.uni-passau.de>	2012-08-03 12:50:07 +0000
commit	6217e18a7dd50c9f112f8468f7c77fc260f40d89 (patch)
tree	cbf2767ae63ffb99467b07bc3edf7ec7d461aff0 /polly/test/CodeGen
parent	7555b54020b1abcc1e67f66452600e2fa8f93dcd (diff)
download	bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.tar.gz bcm5719-llvm-6217e18a7dd50c9f112f8468f7c77fc260f40d89.zip