summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRoman Gareev <gareevroman@gmail.com>2016-07-25 09:42:53 +0000
committerRoman Gareev <gareevroman@gmail.com>2016-07-25 09:42:53 +0000
commit3a18a931a8f47879c4b246600a2bd59ca4f61c82 (patch)
treec145b0d3c80f4f6acb23c00b91a3af4964fbc4b1
parent73f5c785c103edb05a5619c30eff040511816561 (diff)
downloadbcm5719-llvm-3a18a931a8f47879c4b246600a2bd59ca4f61c82.tar.gz
bcm5719-llvm-3a18a931a8f47879c4b246600a2bd59ca4f61c82.zip
Apply all necessary tilings and interchangings to get a macro-kernel
This is the second patch to apply the BLIS matmul optimization pattern on matmul kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf). BLIS implements gemm as three nested loops around a macro-kernel, plus two packing routines. The macro-kernel is implemented in terms of two additional loops around a micro-kernel. The micro-kernel is a loop around a rank-1 (i.e., outer product) update. In this change we create the BLIS macro-kernel by applying a combination of tiling and interchanging. In subsequent changes we will implement the packing transformation. Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D21491 llvm-svn: 276627
-rw-r--r--polly/include/polly/ScheduleOptimizer.h54
-rw-r--r--polly/lib/Transform/ScheduleOptimizer.cpp95
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll51
3 files changed, 186 insertions, 14 deletions
diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h
index f41fe1d99db..38ecb011c8d 100644
--- a/polly/include/polly/ScheduleOptimizer.h
+++ b/polly/include/polly/ScheduleOptimizer.h
@@ -30,6 +30,17 @@ struct MicroKernelParamsTy {
int Nr;
};
+/// @brief Parameters of the macro kernel.
+///
+/// Parameters that determine the sizes of blocks of partitioned matrices
+/// used in the optimized matrix multiplication.
+///
+struct MacroKernelParamsTy {
+ int Mc;
+ int Nc;
+ int Kc;
+};
+
namespace polly {
extern bool DisablePollyTiling;
class Scop;
@@ -115,10 +126,10 @@ private:
applyRegisterTiling(__isl_take isl_schedule_node *Node,
llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
- /// @brief Apply the BLIS matmul optimization pattern
+ /// @brief Apply the BLIS matmul optimization pattern.
///
- /// Apply the BLIS matmul optimization pattern. BLIS implements gemm
- /// as three nested loops around a macro-kernel, plus two packing routines.
+ /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
+ /// nested loops around a macro-kernel, plus two packing routines.
/// The macro-kernel is implemented in terms of two additional loops around
/// a micro-kernel. The micro-kernel is a loop around a rank-1
/// (i.e., outer product) update.
@@ -129,19 +140,20 @@ private:
/// Technical Report, 2014
/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
///
- /// We create the BLIS micro-kernel by applying a combination of tiling
- /// and unrolling. In subsequent changes we will add the extraction
- /// of the BLIS macro-kernel and implement the packing transformation.
+ /// In our case matrices are stored in row-major order, which is taken into
+ /// account during the creation of the BLIS kernels and the computation
+ /// of their parameters.
///
- /// It is assumed that the Node is successfully checked
- /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently
- /// in case of matmul kernels the application of optimizeMatMulPattern
- /// can lead to close-to-peak performance. Maybe it can be generalized
- /// to effectively optimize the whole class of successfully checked
- /// statements.
+ /// @see ScheduleTreeOptimizer::createMicroKernel
+ /// @see ScheduleTreeOptimizer::createMacroKernel
+ /// @see getMicroKernelParams
+ /// @see getMacroKernelParams
///
- /// @param Node the node that contains a band to be optimized.
- /// @return Modified isl_schedule_node.
+ /// TODO: Implement the packing transformation.
+ ///
+ /// @param Node The node that contains a band to be optimized. The node
+ /// is required to successfully pass
+ /// ScheduleTreeOptimizer::isMatrMultPattern.
static __isl_give isl_schedule_node *
optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
const llvm::TargetTransformInfo *TTI);
@@ -247,6 +259,20 @@ private:
///
/// We create the BLIS macro-kernel by applying a combination of tiling
/// of dimensions of the band node and interchanging of two innermost
+ /// modified dimensions. The values of MacroKernelParams's fields are used
+ /// as tile sizes.
+ ///
+ /// @param Node The schedule node to be modified.
+ /// @param MacroKernelParams Parameters of the macro kernel
+ /// to be used as tile sizes.
+ static __isl_give isl_schedule_node *
+ createMacroKernel(__isl_take isl_schedule_node *Node,
+ MacroKernelParamsTy MacroKernelParams);
+
+ /// @brief Create the BLIS macro-kernel.
+ ///
+ /// We create the BLIS macro-kernel by applying a combination of tiling
+ /// of dimensions of the band node and interchanging of two innermost
/// modified dimensions. The values passed in MicroKernelParam are used
/// as tile sizes.
///
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index e1689e54aac..acecf9dda5e 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -134,6 +134,17 @@ static cl::opt<int> ThrougputVectorFma(
"instructions per clock cycle."),
cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
+static cl::list<int>
+ CacheLevelAssociativity("polly-target-cache-level-associativity",
+ cl::desc("The associativity of each cache level."),
+ cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
+ cl::cat(PollyCategory));
+
+static cl::list<int> CacheLevelSizes(
+ "polly-target-cache-level-sizes",
+ cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
+ cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
+
static cl::opt<int> FirstLevelDefaultTileSize(
"polly-default-tile-size",
cl::desc("The default tile size (if not enough were provided by"
@@ -493,12 +504,52 @@ static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
}
+/// @brief Permute two dimensions of the band node.
+///
+/// Permute FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static __isl_give isl_schedule_node *
+permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
+ unsigned SecondDim) {
+ assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
+ isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
+ auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
+ auto PartialScheduleFirstDim =
+ isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
+ auto PartialScheduleSecondDim =
+ isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
+ PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+ PartialSchedule, SecondDim, PartialScheduleFirstDim);
+ PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+ PartialSchedule, FirstDim, PartialScheduleSecondDim);
+ Node = isl_schedule_node_delete(Node);
+ Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
+ return Node;
+}
+
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
__isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
1);
}
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
+ __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
+ assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+ if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+ MacroKernelParams.Kc == 1)
+ return Node;
+ Node = tileNode(
+ Node, "1st level tiling",
+ {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+ Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
+ Node = permuteBandNodeDimensions(Node, 1, 2);
+ return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
+}
+
/// Get parameters of the BLIS micro kernel.
///
/// We choose the Mr and Nr parameters of the micro kernel to be large enough
@@ -525,10 +576,54 @@ getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
return {Mr, Nr};
}
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should be ideally kept in cache between
+/// iterations. Since parameters of the macro kernel determine sizes of these
+/// blocks, there are upper and lower bounds on these parameters.
+///
+/// @param MicroKernelParams Parameters of the micro-kernel
+/// to be taken into account.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
+ // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+ // it requires information about the first two levels of a cache to determine
+ // all the parameters of a macro-kernel. It also checks that an associativity
+ // degree of a cache level is greater than two. Otherwise, another algorithm
+ // for determination of the parameters should be used.
+ if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+ CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
+ CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
+ CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
+ return {1, 1, 1};
+ int Cbr = floor(
+ (CacheLevelAssociativity[0] - 1) /
+ (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
+ int Kc = (Cbr * CacheLevelSizes[0]) /
+ (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
+ double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
+ CacheLevelAssociativity[1]) /
+ CacheLevelSizes[1];
+ double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
+ CacheLevelAssociativity[1]) /
+ CacheLevelSizes[1];
+ int Mc = floor(MicroKernelParams.Mr / Cac);
+ int Nc =
+ floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
+ return {Mc, Nc, Kc};
+}
+
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
__isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
assert(TTI && "The target transform info should be provided.");
auto MicroKernelParams = getMicroKernelParams(TTI);
+ auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
+ Node = createMacroKernel(Node, MacroKernelParams);
Node = createMicroKernel(Node, MicroKernelParams);
return Node;
}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index 51c9da2978c..45a352c8d37 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,4 +1,5 @@
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
;
; /* C := alpha*A*B + beta*C */
; for (i = 0; i < _PB_NI; i++)
@@ -62,6 +63,56 @@
; CHECK: }
; CHECK: }
;
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: for (int c0 = 0; c0 <= 65; c0 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c1 = 0; c1 <= 3; c1 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c2 = 0; c2 <= 10; c2 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: for (int c3 = 0; c3 <= 3; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c4 = 0; c4 <= 11; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c5 = 0; c5 <= 255; c5 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: {
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
OpenPOWER on IntegriCloud