5 files changed, 243 insertions, 13 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index eb46d453550..b882455c620 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -112,6 +112,111 @@ static cl::opt<int>
                cl::desc("Minimal number of compute statements to run on GPU."),
                cl::Hidden, cl::init(10 * 512 * 512));
 
+/// Used to store information PPCG wants for kills. This information is
+/// used by live range reordering.
+///
+/// @see computeLiveRangeReordering
+/// @see GPUNodeBuilder::createPPCGScop
+/// @see GPUNodeBuilder::createPPCGProg
+struct MustKillsInfo {
+  /// Collection of all kill statements that will be sequenced at the end of
+  /// PPCGScop->schedule.
+  ///
+  /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
+  /// which merges schedules in *arbitrary* order.
+  /// (we don't care about the order of the kills anyway).
+  isl::schedule KillsSchedule;
+  /// Map from kill statement instances to scalars that need to be
+  /// killed.
+  ///
+  /// We currently only derive kill information for phi nodes, as phi nodes
+  /// allow us to easily derive kill information. PHI nodes are not alive
+  /// outside the scop and can consequently all be "killed". [params] -> {
+  /// [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] }
+  isl::union_map TaggedMustKills;
+
+  MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
+};
+
+/// Compute must-kills needed to enable live range reordering with PPCG.
+///
+/// @params S The Scop to compute live range reordering information
+/// @returns live range reordering information that can be used to setup
+/// PPCG.
+static MustKillsInfo computeMustKillsInfo(const Scop &S) {
+  const isl::space ParamSpace(isl::manage(S.getParamSpace()));
+  MustKillsInfo Info;
+
+  // 1. Collect phi nodes in scop.
+  SmallVector<isl::id, 4> KillMemIds;
+  for (ScopArrayInfo *SAI : S.arrays()) {
+    if (!SAI->isPHIKind())
+      continue;
+
+    KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
+  }
+
+  Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
+
+  // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
+  // schedule:
+  //     - filter: "[control] -> { }"
+  // So, we choose to not create this to keep the output a little nicer,
+  // at the cost of some code complexity.
+  Info.KillsSchedule = nullptr;
+
+  for (isl::id &phiId : KillMemIds) {
+    isl::id KillStmtId = isl::id::alloc(
+        S.getIslCtx(), std::string("SKill_phantom_").append(phiId.get_name()),
+        nullptr);
+
+    // NOTE: construction of tagged_must_kill:
+    // 2. We need to construct a map:
+    //     [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref }
+    // To construct this, we use `isl_map_domain_product` on 2 maps`:
+    // 2a. StmtToPhi:
+    //         [param] -> { Stmt_phantom[] -> phi_ref[] }
+    // 2b. PhantomRefToPhi:
+    //         [param] -> { ref_phantom[] -> phi_ref[] }
+    //
+    // Combining these with `isl_map_domain_product` gives us
+    // TaggedMustKill:
+    //     [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+
+    // 2a. [param] -> { S_2[] -> phi_ref[] }
+    isl::map StmtToPhi = isl::map::universe(isl::space(ParamSpace));
+    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
+    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::out, isl::id(phiId));
+
+    isl::id PhantomRefId = isl::id::alloc(
+        S.getIslCtx(), std::string("ref_phantom") + phiId.get_name(), nullptr);
+
+    // 2b. [param] -> { phantom_ref[] -> memref[] }
+    isl::map PhantomRefToPhi = isl::map::universe(isl::space(ParamSpace));
+    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::in, PhantomRefId);
+    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::out, phiId);
+
+    // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+    isl::map TaggedMustKill = StmtToPhi.domain_product(PhantomRefToPhi);
+    Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
+
+    // 3. Create the kill schedule of the form:
+    //     "[param] -> { Stmt_phantom[] }"
+    // Then add this to Info.KillsSchedule.
+    isl::space KillStmtSpace = ParamSpace;
+    KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
+    isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
+
+    isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
+    if (Info.KillsSchedule)
+      Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule);
+    else
+      Info.KillsSchedule = KillSchedule;
+  }
+
+  return Info;
+}
+
 /// Create the ast expressions for a ScopStmt.
 ///
 /// This function is a callback for to generate the ast expressions for each
@@ -2114,6 +2219,8 @@ public:
     auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
 
     PPCGScop->options = createPPCGOptions();
+    // enable live range reordering
+    PPCGScop->options->live_range_reordering = 1;
 
     PPCGScop->start = 0;
     PPCGScop->end = 0;
@@ -2129,10 +2236,9 @@ public:
     PPCGScop->tagged_must_writes = getTaggedMustWrites();
     PPCGScop->must_writes = S->getMustWrites();
     PPCGScop->live_out = nullptr;
-    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
     PPCGScop->tagger = nullptr;
-
-    PPCGScop->independence = nullptr;
+    PPCGScop->independence =
+        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
     PPCGScop->dep_flow = nullptr;
     PPCGScop->tagged_dep_flow = nullptr;
     PPCGScop->dep_false = nullptr;
@@ -2141,8 +2247,15 @@ public:
     PPCGScop->tagged_dep_order = nullptr;
 
     PPCGScop->schedule = S->getScheduleTree();
-    PPCGScop->names = getNames();
 
+    MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
+    // If we have something non-trivial to kill, add it to the schedule
+    if (KillsInfo.KillsSchedule.get())
+      PPCGScop->schedule = isl_schedule_sequence(
+          PPCGScop->schedule, KillsInfo.KillsSchedule.take());
+    PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
+
+    PPCGScop->names = getNames();
     PPCGScop->pet = nullptr;
 
     compute_tagger(PPCGScop);
@@ -2414,7 +2527,13 @@ public:
     PPCGProg->to_inner = getArrayIdentity();
     PPCGProg->to_outer = getArrayIdentity();
     PPCGProg->any_to_outer = nullptr;
-    PPCGProg->array_order = nullptr;
+
+    // this needs to be set when live range reordering is enabled.
+    // NOTE: I believe that is conservatively correct. I'm not sure
+    //       what the semantics of this is.
+    // Quoting PPCG/gpu.h: "Order dependences on non-scalars."
+    PPCGProg->array_order =
+        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
     PPCGProg->n_stmts = std::distance(S->begin(), S->end());
     PPCGProg->stmts = getStatements();
     PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
@@ -2424,7 +2543,6 @@ public:
     createArrays(PPCGProg);
 
     PPCGProg->may_persist = compute_may_persist(PPCGProg);
-
     return PPCGProg;
   }
 
diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll
index ba2674c84aa..465acaf48b5 100644
--- a/polly/test/GPGPU/non-read-only-scalars.ll
+++ b/polly/test/GPGPU/non-read-only-scalars.ll
@@ -67,7 +67,6 @@
 
 ; CODE:   }
 ; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll
index e2780cb2576..eb3203029f8 100644
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@@ -24,9 +24,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CODE-NEXT:     cudaCheckKernel();
 ; CODE-NEXT:   }
 
-; CODE:   cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE:       cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT:  cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
 ; CODE: # kernel0
@@ -41,9 +40,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; IR:      [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
 ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
 
-; IR:      [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
-; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
 ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
 
 ; KERNEL-IR: entry:
diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll
new file mode 100644
index 00000000000..a6484379e5d
--- /dev/null
+++ b/polly/test/GPGPU/privatization-simple.ll
@@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP:      Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth:  1
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; void f(int A[], int B[], int control, int C[]) {
+;     int x;
+; #pragma scop
+;     for(int i = 0; i < 1000; i ++) {
+;         x = 0;
+;         if(control) x = C[i];
+;         B[i] = x * A[i];
+; 
+;     }
+; #pragma endscop
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32 %control, i32* %C) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %if.end
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+  %tobool = icmp eq i32 %control, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp8 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp8, %x.0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %if.end
+  ret void
+}
diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll
new file mode 100644
index 00000000000..f2a4ef238bf
--- /dev/null
+++ b/polly/test/GPGPU/privatization.ll
@@ -0,0 +1,60 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP:      Function: checkPrivatization
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth:  1
+
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; 
+;
+;    void checkPrivatization(int A[], int B[], int C[], int control) {
+;      int x;
+;    #pragma scop
+;      for (int i = 0; i < 1000; i++) {
+;        x = 0;
+;        if (control)
+;          x += C[i];
+;
+;        B[i] = x * A[i];
+;      }
+;    #pragma endscop
+;    }
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @checkPrivatization(i32* %A, i32* %B, i32* %C, i32 %control) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %if.end
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+  %tobool = icmp eq i32 %control, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp9 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp9, %x.0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %if.end
+  ret void
+}