summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tobias Grosser <tobias@grosser.es> 2016-08-05 08:27:24 +0000
committer: Tobias Grosser <tobias@grosser.es> 2016-08-05 08:27:24 +0000
commit: 928d7573ddace6e2bd0cd220d661c3e8cbae128e (patch)
tree: 7d00fb1176c922db1af223cddb2b5b530df3881a
parent: 870bf1788ca95986dacf4a1b56bbc2cf7127c230 (diff)
download: bcm5719-llvm-928d7573ddace6e2bd0cd220d661c3e8cbae128e.tar.gz
download: bcm5719-llvm-928d7573ddace6e2bd0cd220d661c3e8cbae128e.zip
GPGPU: Sort dimension sizes of multi-dimensional shared memory arrays correctly
Before this commit we generated the array type in reverse order, and we also added the outermost dimension size to the new array declaration. The latter is incorrect because Polly additionally assumes an unsized outermost dimension, so we had an off-by-one error in the linearization of access expressions. (llvm-svn: 277802)
-rw-r--r-- polly/lib/CodeGen/PPCGCodeGeneration.cpp 8
-rw-r--r-- polly/test/GPGPU/shared-memory-two-dimensional.ll 103
2 files changed, 110 insertions, 1 deletion
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index d418300d6f7..fa37f097f4c 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1292,11 +1292,17 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
Type *ArrayTy = EleTy;
SmallVector<const SCEV *, 4> Sizes;
- for (unsigned int j = 0; j < Var.array->n_index; ++j) {
+ for (unsigned int j = 1; j < Var.array->n_index; ++j) {
isl_val *Val = isl_vec_get_element_val(Var.size, j);
long Bound = isl_val_get_num_si(Val);
isl_val_free(Val);
Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
+ }
+
+ for (int j = Var.array->n_index - 1; j >= 0; --j) {
+ isl_val *Val = isl_vec_get_element_val(Var.size, j);
+ long Bound = isl_val_get_num_si(Val);
+ isl_val_free(Val);
ArrayTy = ArrayType::get(ArrayTy, Bound);
}
diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll
new file mode 100644
index 00000000000..4b4fcfe3205
--- /dev/null
+++ b/polly/test/GPGPU/shared-memory-two-dimensional.ll
@@ -0,0 +1,103 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-acc-use-shared \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN: -polly-acc-use-shared \
+; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+
+; REQUIRES: pollyacc
+
+; void foo(float A[], float b[][8]) {
+; for (long i = 0; i < 32; i++)
+; for (long j = 0; j < 16; j++)
+; for (long k = 0; k < 8; k++)
+; A[i] += j * k * b[j][k];
+; }
+
+
+; CODE: # kernel0
+; CODE-NEXT: {
+; CODE-NEXT: if (t0 <= 7)
+; CODE-NEXT: for (int c0 = 0; c0 <= 15; c0 += 1)
+; CODE-NEXT: read(c0, t0);
+; CODE-NEXT: read(t0);
+; CODE-NEXT: sync0();
+; CODE-NEXT: for (int c3 = 0; c3 <= 15; c3 += 1)
+; CODE-NEXT: for (int c4 = 0; c4 <= 7; c4 += 1)
+; CODE-NEXT: Stmt_bb8(t0, c3, c4);
+; CODE-NEXT: sync1();
+; CODE-NEXT: write(t0);
+; CODE-NEXT: }
+
+; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4
+
+; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8
+; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0
+; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b
+; KERNEL-NEXT: %shared.read = load float, float* %polly.access.MemRef_b
+; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, [8 x float]* %b) {
+bb:
+ br label %bb3
+
+bb3: ; preds = %bb22, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ]
+ %exitcond2 = icmp ne i64 %i.0, 32
+ br i1 %exitcond2, label %bb4, label %bb24
+
+bb4: ; preds = %bb3
+ br label %bb5
+
+bb5: ; preds = %bb19, %bb4
+ %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ]
+ %exitcond1 = icmp ne i64 %j.0, 16
+ br i1 %exitcond1, label %bb6, label %bb21
+
+bb6: ; preds = %bb5
+ br label %bb7
+
+bb7: ; preds = %bb16, %bb6
+ %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ]
+ %exitcond = icmp ne i64 %k.0, 8
+ br i1 %exitcond, label %bb8, label %bb18
+
+bb8: ; preds = %bb7
+ %tmp = mul nuw nsw i64 %j.0, %k.0
+ %tmp9 = sitofp i64 %tmp to float
+ %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0
+ %tmp11 = load float, float* %tmp10, align 4
+ %tmp12 = fmul float %tmp9, %tmp11
+ %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0
+ %tmp14 = load float, float* %tmp13, align 4
+ %tmp15 = fadd float %tmp14, %tmp12
+ store float %tmp15, float* %tmp13, align 4
+ br label %bb16
+
+bb16: ; preds = %bb8
+ %tmp17 = add nuw nsw i64 %k.0, 1
+ br label %bb7
+
+bb18: ; preds = %bb7
+ br label %bb19
+
+bb19: ; preds = %bb18
+ %tmp20 = add nuw nsw i64 %j.0, 1
+ br label %bb5
+
+bb21: ; preds = %bb5
+ br label %bb22
+
+bb22: ; preds = %bb21
+ %tmp23 = add nuw nsw i64 %i.0, 1
+ br label %bb3
+
+bb24: ; preds = %bb3
+ ret void
+}
OpenPOWER on IntegriCloud