Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For humans the more structured schedule representation is easier to reason about. Together with the more abstract isl programming interface this can result in a lot cleaner code (see this changeset). For machines, the structured schedule and the fact that we now use explicit piecewise affine expressions instead of integer maps makes it easier to generate code from this schedule tree. As a result, we can already see a slight compile-time improvement -- for 3mm from 0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such as full-partial tile separation will most likely result in more streamlined code to be generated. Contributed-by: Roman Gareev <gareevroman@gmail.com> llvm-svn: 243458
author: Tobias Grosser <tobias@grosser.es> 2015-07-28 18:03:36 +0000
committer: Tobias Grosser <tobias@grosser.es> 2015-07-28 18:03:36 +0000
commit: b241d928bd884d418445c6c763c4df9ffaa9303f (patch)
tree: 175917bf8e9200692ac8702f4de1cbcd7d4589b9
parent: 305e8f6312f9d461bb9fd1dd73a894d899b77fcc (diff)
download: bcm5719-llvm-b241d928bd884d418445c6c763c4df9ffaa9303f.tar.gz
bcm5719-llvm-b241d928bd884d418445c6c763c4df9ffaa9303f.zip
3 files changed, 53 insertions, 94 deletions
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 8c35898de56..26b9668d972 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -118,16 +118,14 @@ private:
   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
   bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule);
 
-  /// @brief Create a map that pre-vectorizes one scheduling dimension.
+  /// @brief Pre-vectorizes one scheduling dimension of a schedule band.
   ///
-  /// getPrevectorMap creates a map that maps each input dimension to the same
-  /// output dimension, except for the dimension DimToVectorize.
-  /// DimToVectorize is strip mined by 'VectorWidth' and the newly created
-  /// point loop of DimToVectorize is moved to the innermost level.
+  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
+  /// sinks the resulting point loop.
   ///
-  /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4):
+  /// Example (DimToVectorize=0, VectorWidth=4):
   ///
-  /// | Before transformation
+  /// | Before transformation:
   /// |
   /// | A[i,j] -> [i,j]
   /// |
@@ -135,17 +133,12 @@ private:
   /// |    for (j = 0; j < 128; j++)
   /// |      A(i,j);
   ///
-  ///   Prevector map:
-  ///   [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  ///
   /// | After transformation:
   /// |
-  /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  /// |
-  /// | for (it = 0; it < 128; it+=4)
+  /// | for (it = 0; it < 32; it+=1)
   /// |    for (j = 0; j < 128; j++)
-  /// |      for (ip = max(0,it); ip < min(128, it + 3); ip++)
-  /// |        A(ip,j);
+  /// |      for (ip = 0; ip <= 3; ip++)
+  /// |        A(4 * it + ip,j);
   ///
   /// The goal of this transformation is to create a trivially vectorizable
   /// loop.  This means a parallel loop at the innermost level that has a
@@ -156,9 +149,9 @@ private:
   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
   /// currently constant and not yet target specific. This function does not
   /// reason about parallelism.
-  static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                             int ScheduleDimensions,
-                                             int VectorWidth = 4);
+  static __isl_give isl_schedule_node *
+  prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
+                   int VectorWidth = 4);
 
   /// @brief Apply additional optimizations on the bands in the schedule tree.
   ///
@@ -170,7 +163,7 @@ private:
   ///      - if the band has more than one loop dimension
   ///
   ///  - Prevectorize the schedule of the band (or the point loop in case of
-  ///    tiling)
+  ///    tiling).
   ///      - if vectorization is enabled
   ///
   /// @param Node The schedule node to (possibly) optimize.
@@ -204,58 +197,33 @@ private:
 
 char IslScheduleOptimizer::ID = 0;
 
-__isl_give isl_map *
-IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                      int ScheduleDimensions, int VectorWidth) {
-  isl_space *Space;
-  isl_local_space *LocalSpace, *LocalSpaceRange;
-  isl_set *Modulo;
-  isl_map *TilingMap;
-  isl_constraint *c;
-  isl_aff *Aff;
-  int PointDimension; /* ip */
-  int TileDimension;  /* it */
-  isl_val *VectorWidthMP;
-
-  assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions);
-
-  Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1);
-  TilingMap = isl_map_universe(isl_space_copy(Space));
-  LocalSpace = isl_local_space_from_space(Space);
-  PointDimension = ScheduleDimensions;
-  TileDimension = DimToVectorize;
-
-  // Create an identity map for everything except DimToVectorize and map
-  // DimToVectorize to the point loop at the innermost dimension.
-  for (int i = 0; i < ScheduleDimensions; i++)
-    if (i == DimToVectorize)
-      TilingMap =
-          isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension);
-    else
-      TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i);
-
-  // it % 'VectorWidth' = 0
-  LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace));
-  Aff = isl_aff_zero_on_domain(LocalSpaceRange);
-  Aff = isl_aff_set_constant_si(Aff, VectorWidth);
-  Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1);
-  VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth);
-  Aff = isl_aff_mod_val(Aff, VectorWidthMP);
-  Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff));
-  TilingMap = isl_map_intersect_range(TilingMap, Modulo);
-
-  // it <= ip
-  TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension,
-                               isl_dim_out, PointDimension);
-
-  // ip <= it + ('VectorWidth' - 1)
-  c = isl_inequality_alloc(LocalSpace);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1);
-  isl_constraint_set_constant_si(c, VectorWidth - 1);
-  TilingMap = isl_map_add_constraint(TilingMap, c);
-
-  return TilingMap;
+__isl_give isl_schedule_node *
+IslScheduleOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
+                                       unsigned DimToVectorize,
+                                       int VectorWidth) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+
+  auto Space = isl_schedule_node_band_get_space(Node);
+  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
+  isl_space_free(Space);
+  assert(DimToVectorize < ScheduleDimensions);
+
+  if (DimToVectorize > 0) {
+    Node = isl_schedule_node_band_split(Node, DimToVectorize);
+    Node = isl_schedule_node_child(Node, 0);
+  }
+  if (DimToVectorize < ScheduleDimensions - 1)
+    Node = isl_schedule_node_band_split(Node, 1);
+  Space = isl_schedule_node_band_get_space(Node);
+  auto Sizes = isl_multi_val_zero(Space);
+  auto Ctx = isl_schedule_node_get_ctx(Node);
+  Sizes =
+      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
+  Node = isl_schedule_node_band_tile(Node, Sizes);
+  Node = isl_schedule_node_child(Node, 0);
+  Node = isl_schedule_node_band_sink(Node);
+  Node = isl_schedule_node_child(Node, 0);
+  return Node;
 }
 
 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
@@ -307,21 +275,12 @@ isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
   if (PollyVectorizerChoice == VECTORIZER_NONE)
     return Res;
 
-  auto Schedule = isl_schedule_node_band_get_partial_schedule(Res);
-
-  for (int i = Dims - 1; i >= 0; i--) {
+  for (int i = Dims - 1; i >= 0; i--)
     if (isl_schedule_node_band_member_get_coincident(Res, i)) {
-      auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims);
-      auto TileUMap = isl_union_map_from_map(TileMap);
-      auto Schedule2 = isl_union_map_apply_range(
-          isl_union_map_from_multi_union_pw_aff(Schedule), TileUMap);
-      Schedule = isl_multi_union_pw_aff_from_union_map(Schedule2);
+      Res = IslScheduleOptimizer::prevectSchedBand(Res, i);
       break;
     }
-  }
 
-  Res = isl_schedule_node_delete(Res);
-  Res = isl_schedule_node_insert_partial_schedule(Res, Schedule);
   return Res;
 }
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index 9cb98080853..d045305fcb3 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -55,17 +55,17 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
-; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:   for (int c1 = 0; c1 <= 383; c1 += 1)
 ; CHECK:     #pragma simd
-; CHECK:     for (int c2 = c1; c2 <= c1 + 3; c2 += 1)
-; CHECK:       Stmt_for_body3(c0, c2);
+; CHECK:     for (int c2 = 0; c2 <= 3; c2 += 1)
+; CHECK:       Stmt_for_body3(c0, 4 * c1 + c2);
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
-; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:   for (int c1 = 0; c1 <= 383; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 1535; c2 += 1)
 ; CHECK:       #pragma simd
-; CHECK:       for (int c3 = c1; c3 <= c1 + 3; c3 += 1)
-; CHECK:         Stmt_for_body8(c0, c3, c2);
+; CHECK:       for (int c3 = 0; c3 <= 3; c3 += 1)
+; CHECK:         Stmt_for_body8(c0, 4 * c1 + c3, c2);
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 5a5ccccb4e0..67cab767f1a 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -58,20 +58,20 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 47; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 31; c2 += 1)
-; CHECK:       for (int c3 = 0; c3 <= 31; c3 += 4)
+; CHECK:       for (int c3 = 0; c3 <= 7; c3 += 1)
 ; CHECK:         #pragma simd
-; CHECK:         for (int c4 = c3; c4 <= c3 + 3; c4 += 1)
-; CHECK:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + c4);
+; CHECK:         for (int c4 = 0; c4 <= 3; c4 += 1)
+; CHECK:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4);
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 47; c1 += 1)
 ; CHECK:     for (int c2 = 0; c2 <= 47; c2 += 1)
 ; CHECK:       for (int c3 = 0; c3 <= 31; c3 += 1)
-; CHECK:         for (int c4 = 0; c4 <= 31; c4 += 4)
+; CHECK:         for (int c4 = 0; c4 <= 7; c4 += 1)
 ; CHECK:           for (int c5 = 0; c5 <= 31; c5 += 1)
 ; CHECK:             #pragma simd
-; CHECK:             for (int c6 = c4; c6 <= c4 + 3; c6 += 1)
-; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + c6, 32 * c2 + c5);
+; CHECK:             for (int c6 = 0; c6 <= 3; c6 += 1)
+; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
 
 !llvm.ident = !{!0}
author	Tobias Grosser <tobias@grosser.es>	2015-07-28 18:03:36 +0000
committer	Tobias Grosser <tobias@grosser.es>	2015-07-28 18:03:36 +0000
commit	b241d928bd884d418445c6c763c4df9ffaa9303f (patch)
tree	175917bf8e9200692ac8702f4de1cbcd7d4589b9
parent	305e8f6312f9d461bb9fd1dd73a894d899b77fcc (diff)
download	bcm5719-llvm-b241d928bd884d418445c6c763c4df9ffaa9303f.tar.gz bcm5719-llvm-b241d928bd884d418445c6c763c4df9ffaa9303f.zip