2 files changed, 84 insertions, 10 deletions
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 43ed2785095..5d8ac5a0922 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -169,7 +169,8 @@ private:
   ///      - if the band is tileable
   ///      - if the band has more than one loop dimension
   ///
-  ///  - Prevectorize the point loop of the tile
+  ///  - Prevectorize the schedule of the band (or of the point loops if the
+  ///    band was tiled)
   ///      - if vectorization is enabled
   ///
   /// @param Node The schedule node to (possibly) optimize.
@@ -300,28 +301,29 @@ isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
     Res = Node;
   } else {
     Res = isl_schedule_node_band_tile(Node, Sizes);
+    Child = isl_schedule_node_get_child(Res, 0);
+    isl_schedule_node_free(Res);
+    Res = Child;
   }
 
   if (PollyVectorizerChoice == VECTORIZER_NONE)
     return Res;
 
-  Child = isl_schedule_node_get_child(Res, 0);
-  auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child);
+  auto Schedule = isl_schedule_node_band_get_partial_schedule(Res);
 
   for (int i = Dims - 1; i >= 0; i--) {
-    if (isl_schedule_node_band_member_get_coincident(Child, i)) {
+    if (isl_schedule_node_band_member_get_coincident(Res, i)) {
       auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims);
       auto TileUMap = isl_union_map_from_map(TileMap);
-      auto ChildSchedule2 = isl_union_map_apply_range(
-          isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap);
-      ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2);
+      auto Schedule2 = isl_union_map_apply_range(
+          isl_union_map_from_multi_union_pw_aff(Schedule), TileUMap);
+      Schedule = isl_multi_union_pw_aff_from_union_map(Schedule2);
       break;
     }
   }
 
-  isl_schedule_node_free(Res);
-  Res = isl_schedule_node_delete(Child);
-  Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule);
+  Res = isl_schedule_node_delete(Res);
+  Res = isl_schedule_node_insert_partial_schedule(Res, Schedule);
   return Res;
 }
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
new file mode 100644
index 00000000000..9cb98080853
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-no-tiling -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @foo() #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry.split, %for.inc28
+  %indvar4 = phi i64 [ 0, %entry.split ], [ %indvar.next5, %for.inc28 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.inc25
+  %indvar6 = phi i64 [ 0, %for.cond1.preheader ], [ %indvar.next7, %for.inc25 ]
+  %arrayidx24 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvar4, i64 %indvar6
+  store float 0.000000e+00, float* %arrayidx24, align 4
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body3, %for.body8
+  %indvar = phi i64 [ 0, %for.body3 ], [ %indvar.next, %for.body8 ]
+  %arrayidx16 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvar4, i64 %indvar
+  %arrayidx20 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvar, i64 %indvar6
+  %0 = load float, float* %arrayidx24, align 4
+  %1 = load float, float* %arrayidx16, align 4
+  %2 = load float, float* %arrayidx20, align 4
+  %mul = fmul float %1, %2
+  %add = fadd float %0, %mul
+  store float %add, float* %arrayidx24, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, 1536
+  br i1 %exitcond, label %for.body8, label %for.inc25
+
+for.inc25:                                        ; preds = %for.body8
+  %indvar.next7 = add i64 %indvar6, 1
+  %exitcond8 = icmp ne i64 %indvar.next7, 1536
+  br i1 %exitcond8, label %for.body3, label %for.inc28
+
+for.inc28:                                        ; preds = %for.inc25
+  %indvar.next5 = add i64 %indvar4, 1
+  %exitcond9 = icmp ne i64 %indvar.next5, 1536
+  br i1 %exitcond9, label %for.cond1.preheader, label %for.end30
+
+for.end30:                                        ; preds = %for.inc28
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; CHECK: #pragma known-parallel
+; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
+; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:     #pragma simd
+; CHECK:     for (int c2 = c1; c2 <= c1 + 3; c2 += 1)
+; CHECK:       Stmt_for_body3(c0, c2);
+; CHECK: #pragma known-parallel
+; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
+; CHECK:   for (int c1 = 0; c1 <= 1535; c1 += 4)
+; CHECK:     for (int c2 = 0; c2 <= 1535; c2 += 1)
+; CHECK:       #pragma simd
+; CHECK:       for (int c3 = c1; c3 <= c1 + 3; c3 += 1)
+; CHECK:         Stmt_for_body8(c0, c3, c2);
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.5.0 "}