2 files changed, 36 insertions, 2 deletions
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 9a54c1da67a..7ff8e7c265a 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -107,6 +107,12 @@ static cl::opt<std::string>
                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<int> PrevectorWidth( // VectorWidth for prevectSchedBand().
+    "polly-prevect-width", // Default 4 equals the old VectorWidth default.
+    cl::desc(
+        "The number of loop iterations to strip-mine for pre-vectorization"),
+    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<int> DefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -176,7 +182,7 @@ private:
   /// reason about parallelism.
   static __isl_give isl_schedule_node *
   prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
-                   int VectorWidth = 4);
+                   int VectorWidth);
 
   /// @brief Apply additional optimizations on the bands in the schedule tree.
   ///
@@ -298,7 +304,7 @@ isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
 
   for (int i = Dims - 1; i >= 0; i--)
     if (isl_schedule_node_band_member_get_coincident(Node, i)) {
-      Node = IslScheduleOptimizer::prevectSchedBand(Node, i);
+      Node = IslScheduleOptimizer::prevectSchedBand(Node, i, PrevectorWidth);
       break;
     }
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 67cab767f1a..fdce15f54c1 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,11 @@
 ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s 
 ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=stripmine -polly-ast -analyze < %s | FileCheck %s
+
+; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl \
+; RUN:                   -polly-vectorizer=polly -polly-ast -analyze \
+; RUN:                   -polly-prevect-width=16 < %s | \
+; RUN:                   FileCheck %s -check-prefix=VEC16
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
@@ -73,6 +79,28 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 ; CHECK:             for (int c6 = 0; c6 <= 3; c6 += 1)
 ; CHECK:               Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
 
+; VEC16: {
+; VEC16:   #pragma known-parallel
+; VEC16:   for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16:     for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; VEC16:         for (int c3 = 0; c3 <= 1; c3 += 1)
+; VEC16:           #pragma simd
+; VEC16:           for (int c4 = 0; c4 <= 15; c4 += 1)
+; VEC16:             Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
+; VEC16:   #pragma known-parallel
+; VEC16:   for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16:     for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16:       for (int c2 = 0; c2 <= 47; c2 += 1)
+; VEC16:         for (int c3 = 0; c3 <= 31; c3 += 1)
+; VEC16:           for (int c4 = 0; c4 <= 1; c4 += 1)
+; VEC16:             for (int c5 = 0; c5 <= 31; c5 += 1)
+; VEC16:               #pragma simd
+; VEC16:               for (int c6 = 0; c6 <= 15; c6 += 1)
+; VEC16:                 Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
+; VEC16: }
+
+
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.5.0 "}