diff options
author | Tobias Grosser <tobias@grosser.es> | 2015-08-19 08:46:11 +0000 |
---|---|---|
committer | Tobias Grosser <tobias@grosser.es> | 2015-08-19 08:46:11 +0000 |
commit | 07c1c2fcc9e7205fcd0813c2e380ec24099baac4 (patch) | |
tree | 8d793b4ee4ee334bb41fe16ee62d747a17efd360 | |
parent | 161c9081e5f09085aa0675751bfd16469826e966 (diff) | |
download | bcm5719-llvm-07c1c2fcc9e7205fcd0813c2e380ec24099baac4.tar.gz bcm5719-llvm-07c1c2fcc9e7205fcd0813c2e380ec24099baac4.zip |
Make prevectorization width configurable
Polly uses 'prevectorization' to enable outer loop vectorization. When
vectorizing an outer loop, we strip-mine <number-of-prevec-dims> loop
iterations which are then interchanged to the innermost level such that LLVM's
inner loop vectorizer (or Polly's simple vectorizer) can easily vectorize this
loop. The number of loop iterations to strip-mine is now configurable with the
option -polly-prevect-width=<number-of-prevec-dims>.
This is mostly a debugging option. We should probably add a heuristic that
derives the number of prevectorization dimensions from the target data and
the data types used.
llvm-svn: 245424
-rw-r--r-- | polly/lib/Transform/ScheduleOptimizer.cpp | 10 | ||||
-rw-r--r-- | polly/test/ScheduleOptimizer/prevectorization.ll | 28 |
2 files changed, 36 insertions, 2 deletions
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 9a54c1da67a..7ff8e7c265a 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -107,6 +107,12 @@ static cl::opt<std::string> cl::desc("Maximize the band depth (yes/no)"), cl::Hidden, cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt<int> PrevectorWidth( + "polly-prevect-width", + cl::desc( + "The number of loop iterations to strip-mine for pre-vectorization"), + cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory)); + static cl::opt<int> DefaultTileSize( "polly-default-tile-size", cl::desc("The default tile size (if not enough were provided by" @@ -176,7 +182,7 @@ private: /// reason about parallelism. static __isl_give isl_schedule_node * prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize, - int VectorWidth = 4); + int VectorWidth); /// @brief Apply additional optimizations on the bands in the schedule tree. 
/// @@ -298,7 +304,7 @@ isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node, for (int i = Dims - 1; i >= 0; i--) if (isl_schedule_node_band_member_get_coincident(Node, i)) { - Node = IslScheduleOptimizer::prevectSchedBand(Node, i); + Node = IslScheduleOptimizer::prevectSchedBand(Node, i, PrevectorWidth); break; } diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll index 67cab767f1a..fdce15f54c1 100644 --- a/polly/test/ScheduleOptimizer/prevectorization.ll +++ b/polly/test/ScheduleOptimizer/prevectorization.ll @@ -1,5 +1,11 @@ ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s ; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=stripmine -polly-ast -analyze < %s | FileCheck %s + +; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl \ +; RUN: -polly-vectorizer=polly -polly-ast -analyze \ +; RUN: -polly-prevect-width=16 < %s | \ +; RUN: FileCheck %s -check-prefix=VEC16 + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @C = common global [1536 x [1536 x float]] zeroinitializer, align 16 @@ -73,6 +79,28 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe ; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1) ; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); +; VEC16: { +; VEC16: #pragma known-parallel +; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1) +; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1) +; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1) +; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1) +; VEC16: #pragma simd +; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1) +; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4); +; VEC16: #pragma known-parallel +; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1) +; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1) +; VEC16: for (int c2 = 0; c2 <= 47; c2 += 
1) +; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1) +; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1) +; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1) +; VEC16: #pragma simd +; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1) +; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5); +; VEC16: } + + !llvm.ident = !{!0} !0 = !{!"clang version 3.5.0 "} |