summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp130
-rw-r--r--polly/test/GPGPU/non-read-only-scalars.ll1
-rw-r--r--polly/test/GPGPU/phi-nodes-in-kernel.ll9
-rw-r--r--polly/test/GPGPU/privatization-simple.ll56
-rw-r--r--polly/test/GPGPU/privatization.ll60
5 files changed, 243 insertions, 13 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index eb46d453550..b882455c620 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -112,6 +112,111 @@ static cl::opt<int>
cl::desc("Minimal number of compute statements to run on GPU."),
cl::Hidden, cl::init(10 * 512 * 512));
+/// Used to store information PPCG wants for kills. This information is
+/// used by live range reordering.
+///
+/// @see computeLiveRangeReordering
+/// @see GPUNodeBuilder::createPPCGScop
+/// @see GPUNodeBuilder::createPPCGProg
+struct MustKillsInfo {
+ /// Collection of all kill statements that will be sequenced at the end of
+ /// PPCGScop->schedule.
+ ///
+ /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
+ /// which merges schedules in *arbitrary* order.
+ /// (we don't care about the order of the kills anyway).
+ isl::schedule KillsSchedule;
+ /// Map from kill statement instances to scalars that need to be
+ /// killed.
+ ///
+ /// We currently only derive kill information for phi nodes, as phi nodes
+ /// allow us to easily derive kill information. PHI nodes are not alive
+ /// outside the scop and can consequently all be "killed". [params] -> {
+ /// [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] }
+ isl::union_map TaggedMustKills;
+
+ MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
+};
+
+/// Compute must-kills needed to enable live range reordering with PPCG.
+///
+/// @params S The Scop to compute live range reordering information
+/// @returns live range reordering information that can be used to setup
+/// PPCG.
+static MustKillsInfo computeMustKillsInfo(const Scop &S) {
+ const isl::space ParamSpace(isl::manage(S.getParamSpace()));
+ MustKillsInfo Info;
+
+ // 1. Collect phi nodes in scop.
+ SmallVector<isl::id, 4> KillMemIds;
+ for (ScopArrayInfo *SAI : S.arrays()) {
+ if (!SAI->isPHIKind())
+ continue;
+
+ KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
+ }
+
+ Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
+
+ // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
+ // schedule:
+ // - filter: "[control] -> { }"
+ // So, we choose to not create this to keep the output a little nicer,
+ // at the cost of some code complexity.
+ Info.KillsSchedule = nullptr;
+
+ for (isl::id &phiId : KillMemIds) {
+ isl::id KillStmtId = isl::id::alloc(
+ S.getIslCtx(), std::string("SKill_phantom_").append(phiId.get_name()),
+ nullptr);
+
+ // NOTE: construction of tagged_must_kill:
+ // 2. We need to construct a map:
+ // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref }
+ // To construct this, we use `isl_map_domain_product` on 2 maps`:
+ // 2a. StmtToPhi:
+ // [param] -> { Stmt_phantom[] -> phi_ref[] }
+ // 2b. PhantomRefToPhi:
+ // [param] -> { ref_phantom[] -> phi_ref[] }
+ //
+ // Combining these with `isl_map_domain_product` gives us
+ // TaggedMustKill:
+ // [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+
+ // 2a. [param] -> { S_2[] -> phi_ref[] }
+ isl::map StmtToPhi = isl::map::universe(isl::space(ParamSpace));
+ StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
+ StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::out, isl::id(phiId));
+
+ isl::id PhantomRefId = isl::id::alloc(
+ S.getIslCtx(), std::string("ref_phantom") + phiId.get_name(), nullptr);
+
+ // 2b. [param] -> { phantom_ref[] -> memref[] }
+ isl::map PhantomRefToPhi = isl::map::universe(isl::space(ParamSpace));
+ PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::in, PhantomRefId);
+ PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::out, phiId);
+
+ // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+ isl::map TaggedMustKill = StmtToPhi.domain_product(PhantomRefToPhi);
+ Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
+
+ // 3. Create the kill schedule of the form:
+ // "[param] -> { Stmt_phantom[] }"
+ // Then add this to Info.KillsSchedule.
+ isl::space KillStmtSpace = ParamSpace;
+ KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
+ isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
+
+ isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
+ if (Info.KillsSchedule)
+ Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule);
+ else
+ Info.KillsSchedule = KillSchedule;
+ }
+
+ return Info;
+}
+
/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback for to generate the ast expressions for each
@@ -2114,6 +2219,8 @@ public:
auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
PPCGScop->options = createPPCGOptions();
+ // enable live range reordering
+ PPCGScop->options->live_range_reordering = 1;
PPCGScop->start = 0;
PPCGScop->end = 0;
@@ -2129,10 +2236,9 @@ public:
PPCGScop->tagged_must_writes = getTaggedMustWrites();
PPCGScop->must_writes = S->getMustWrites();
PPCGScop->live_out = nullptr;
- PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
PPCGScop->tagger = nullptr;
-
- PPCGScop->independence = nullptr;
+ PPCGScop->independence =
+ isl_union_map_empty(isl_set_get_space(PPCGScop->context));
PPCGScop->dep_flow = nullptr;
PPCGScop->tagged_dep_flow = nullptr;
PPCGScop->dep_false = nullptr;
@@ -2141,8 +2247,15 @@ public:
PPCGScop->tagged_dep_order = nullptr;
PPCGScop->schedule = S->getScheduleTree();
- PPCGScop->names = getNames();
+ MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
+ // If we have something non-trivial to kill, add it to the schedule
+ if (KillsInfo.KillsSchedule.get())
+ PPCGScop->schedule = isl_schedule_sequence(
+ PPCGScop->schedule, KillsInfo.KillsSchedule.take());
+ PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
+
+ PPCGScop->names = getNames();
PPCGScop->pet = nullptr;
compute_tagger(PPCGScop);
@@ -2414,7 +2527,13 @@ public:
PPCGProg->to_inner = getArrayIdentity();
PPCGProg->to_outer = getArrayIdentity();
PPCGProg->any_to_outer = nullptr;
- PPCGProg->array_order = nullptr;
+
+ // this needs to be set when live range reordering is enabled.
+ // NOTE: I believe that is conservatively correct. I'm not sure
+ // what the semantics of this is.
+ // Quoting PPCG/gpu.h: "Order dependences on non-scalars."
+ PPCGProg->array_order =
+ isl_union_map_empty(isl_set_get_space(PPCGScop->context));
PPCGProg->n_stmts = std::distance(S->begin(), S->end());
PPCGProg->stmts = getStatements();
PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
@@ -2424,7 +2543,6 @@ public:
createArrays(PPCGProg);
PPCGProg->may_persist = compute_may_persist(PPCGProg);
-
return PPCGProg;
}
diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll
index ba2674c84aa..465acaf48b5 100644
--- a/polly/test/GPGPU/non-read-only-scalars.ll
+++ b/polly/test/GPGPU/non-read-only-scalars.ll
@@ -67,7 +67,6 @@
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll
index e2780cb2576..eb3203029f8 100644
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@@ -24,9 +24,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
-; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
@@ -41,9 +40,7 @@ target triple = "x86_64-unknown-linux-gnu"
; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
-; IR: [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
-; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
; KERNEL-IR: entry:
diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll
new file mode 100644
index 00000000000..a6484379e5d
--- /dev/null
+++ b/polly/test/GPGPU/privatization-simple.ll
@@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP: Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; void f(int A[], int B[], int control, int C[]) {
+; int x;
+; #pragma scop
+; for(int i = 0; i < 1000; i ++) {
+; x = 0;
+; if(control) x = C[i];
+; B[i] = x * A[i];
+;
+; }
+; #pragma endscop
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32 %control, i32* %C) {
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %entry.split, %if.end
+ %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+ %tobool = icmp eq i32 %control, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %tmp4 = load i32, i32* %arrayidx, align 4
+ br label %if.end
+
+if.end: ; preds = %for.body, %if.then
+ %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp8 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %tmp8, %x.0
+ %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %if.end
+ ret void
+}
diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll
new file mode 100644
index 00000000000..f2a4ef238bf
--- /dev/null
+++ b/polly/test/GPGPU/privatization.ll
@@ -0,0 +1,60 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP: Function: checkPrivatization
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+;
+;
+; void checkPrivatization(int A[], int B[], int C[], int control) {
+; int x;
+; #pragma scop
+; for (int i = 0; i < 1000; i++) {
+; x = 0;
+; if (control)
+; x += C[i];
+;
+; B[i] = x * A[i];
+; }
+; #pragma endscop
+; }
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @checkPrivatization(i32* %A, i32* %B, i32* %C, i32 %control) {
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %entry.split, %if.end
+ %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+ %tobool = icmp eq i32 %control, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %tmp4 = load i32, i32* %arrayidx, align 4
+ br label %if.end
+
+if.end: ; preds = %for.body, %if.then
+ %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp9 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %tmp9, %x.0
+ %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %if.end
+ ret void
+}
OpenPOWER on IntegriCloud