summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTobias Grosser <tobias@grosser.es>2015-08-19 08:46:11 +0000
committerTobias Grosser <tobias@grosser.es>2015-08-19 08:46:11 +0000
commit07c1c2fcc9e7205fcd0813c2e380ec24099baac4 (patch)
tree8d793b4ee4ee334bb41fe16ee62d747a17efd360
parent161c9081e5f09085aa0675751bfd16469826e966 (diff)
downloadbcm5719-llvm-07c1c2fcc9e7205fcd0813c2e380ec24099baac4.tar.gz
bcm5719-llvm-07c1c2fcc9e7205fcd0813c2e380ec24099baac4.zip
Make prevectorization width configurable
Polly uses 'prevectorization' to enable outer loop vectorization. When vectorizing an outer loop, we strip-mine <number-of-prevec-dims> loop iterations which are then interchanged to the innermost level such that LLVM's inner loop vectorizer (or Polly's simple vectorizer) can easily vectorize this loop. The number of loop iterations to strip-mine is now configurable with the option -polly-prevect-width=<number-of-prevec-dims>. This is mostly a debugging option. We should probably add a heuristic that derives the number of prevectorization dimensions from the target data and the data types used. llvm-svn: 245424
-rw-r--r-- polly/lib/Transform/ScheduleOptimizer.cpp 10
-rw-r--r-- polly/test/ScheduleOptimizer/prevectorization.ll 28
2 files changed, 36 insertions, 2 deletions
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 9a54c1da67a..7ff8e7c265a 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -107,6 +107,12 @@ static cl::opt<std::string>
cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
+static cl::opt<int> PrevectorWidth(
+ "polly-prevect-width",
+ cl::desc(
+ "The number of loop iterations to strip-mine for pre-vectorization"),
+ cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
+
static cl::opt<int> DefaultTileSize(
"polly-default-tile-size",
cl::desc("The default tile size (if not enough were provided by"
@@ -176,7 +182,7 @@ private:
/// reason about parallelism.
static __isl_give isl_schedule_node *
prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
- int VectorWidth = 4);
+ int VectorWidth);
/// @brief Apply additional optimizations on the bands in the schedule tree.
///
@@ -298,7 +304,7 @@ isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
for (int i = Dims - 1; i >= 0; i--)
if (isl_schedule_node_band_member_get_coincident(Node, i)) {
- Node = IslScheduleOptimizer::prevectSchedBand(Node, i);
+ Node = IslScheduleOptimizer::prevectSchedBand(Node, i, PrevectorWidth);
break;
}
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 67cab767f1a..fdce15f54c1 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,11 @@
; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s
; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=stripmine -polly-ast -analyze < %s | FileCheck %s
+
+; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl \
+; RUN: -polly-vectorizer=polly -polly-ast -analyze \
+; RUN: -polly-prevect-width=16 < %s | \
+; RUN: FileCheck %s -check-prefix=VEC16
+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
@@ -73,6 +79,28 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
+; VEC16: {
+; VEC16: #pragma known-parallel
+; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1)
+; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1)
+; VEC16: #pragma simd
+; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1)
+; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
+; VEC16: #pragma known-parallel
+; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1)
+; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
+; VEC16: for (int c2 = 0; c2 <= 47; c2 += 1)
+; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1)
+; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1)
+; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1)
+; VEC16: #pragma simd
+; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1)
+; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
+; VEC16: }
+
+
!llvm.ident = !{!0}
!0 = !{!"clang version 3.5.0 "}
OpenPOWER on IntegriCloud