| author    | Roman Gareev <gareevroman@gmail.com>     | 2016-07-25 09:42:53 +0000 |
|-----------|------------------------------------------|---------------------------|
| committer | Roman Gareev <gareevroman@gmail.com>     | 2016-07-25 09:42:53 +0000 |
| commit    | 3a18a931a8f47879c4b246600a2bd59ca4f61c82 |                           |
| tree      | c145b0d3c80f4f6acb23c00b91a3af4964fbc4b1 |                           |
| parent    | 73f5c785c103edb05a5619c30eff040511816561 |                           |
Apply all necessary tilings and interchangings to get a macro-kernel
This is the second patch to apply the BLIS matmul optimization pattern
to matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: http://reviews.llvm.org/D21491
llvm-svn: 276627
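
For readers who want a concrete picture of the loop structure described in the commit message, here is a minimal illustrative sketch (not code from this patch; the function name, loop bounds, and the row-major layout are assumptions) of where the three outer loops, the macro-kernel, the micro-kernel, and the rank-1 update end up:

#include <algorithm>

// Illustrative BLIS-style gemm loop nest for row-major C += A * B. Packing of
// A and B is omitted, since it is left to the follow-up patches mentioned
// above. Mc/Nc/Kc are the macro-kernel tile sizes, Mr/Nr the micro-kernel
// (register) tile sizes.
void gemmBlisSketch(int M, int N, int K, const double *A, const double *B,
                    double *C, int Mc, int Nc, int Kc, int Mr, int Nr) {
  for (int jc = 0; jc < N; jc += Nc)      // three loops around
    for (int pc = 0; pc < K; pc += Kc)    // the macro-kernel
      for (int ic = 0; ic < M; ic += Mc)
        // Macro-kernel: two loops around the micro-kernel.
        for (int jr = jc; jr < std::min(jc + Nc, N); jr += Nr)
          for (int ir = ic; ir < std::min(ic + Mc, M); ir += Mr)
            // Micro-kernel: a loop around a rank-1 (outer product) update
            // of an Mr x Nr block of C.
            for (int p = pc; p < std::min(pc + Kc, K); ++p)
              for (int i = ir; i < std::min(ir + Mr, M); ++i)
                for (int j = jr; j < std::min(jr + Nr, N); ++j)
                  C[i * N + j] += A[i * K + p] * B[p * N + j];
}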
-rw-r--r-- | polly/include/polly/ScheduleOptimizer.h                       | 54
-rw-r--r-- | polly/lib/Transform/ScheduleOptimizer.cpp                     | 95
-rw-r--r-- | polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll | 51
3 files changed, 186 insertions, 14 deletions
diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h
index f41fe1d99db..38ecb011c8d 100644
--- a/polly/include/polly/ScheduleOptimizer.h
+++ b/polly/include/polly/ScheduleOptimizer.h
@@ -30,6 +30,17 @@ struct MicroKernelParamsTy {
   int Nr;
 };
 
+/// @brief Parameters of the macro kernel.
+///
+/// Parameters, which determine sizes of blocks of partitioned matrices
+/// used in the optimized matrix multiplication.
+///
+struct MacroKernelParamsTy {
+  int Mc;
+  int Nc;
+  int Kc;
+};
+
 namespace polly {
 extern bool DisablePollyTiling;
 class Scop;
@@ -115,10 +126,10 @@ private:
   applyRegisterTiling(__isl_take isl_schedule_node *Node,
                       llvm::ArrayRef<int> TileSizes,
                       int DefaultTileSize);
-  /// @brief Apply the BLIS matmul optimization pattern
+  /// @brief Apply the BLIS matmul optimization pattern.
   ///
-  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm
-  /// as three nested loops around a macro-kernel, plus two packing routines.
+  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
+  /// nested loops around a macro-kernel, plus two packing routines.
   /// The macro-kernel is implemented in terms of two additional loops around
   /// a micro-kernel. The micro-kernel is a loop around a rank-1
   /// (i.e., outer product) update.
@@ -129,19 +140,20 @@ private:
   ///   Technical Report, 2014
   ///   http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
   ///
-  /// We create the BLIS micro-kernel by applying a combination of tiling
-  /// and unrolling. In subsequent changes we will add the extraction
-  /// of the BLIS macro-kernel and implement the packing transformation.
+  /// In our case matrices are stored in row-major order, which is taken into
+  /// account during the creation of the BLIS kernels and the computation
+  /// of their parameters.
   ///
-  /// It is assumed that the Node is successfully checked
-  /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently
-  /// in case of matmul kernels the application of optimizeMatMulPattern
-  /// can lead to close-to-peak performance. Maybe it can be generalized
-  /// to effectively optimize the whole class of successfully checked
-  /// statements.
+  /// @see ScheduleTreeOptimizer::createMicroKernel
+  /// @see ScheduleTreeOptimizer::createMacroKernel
+  /// @see getMicroKernelParams
+  /// @see getMacroKernelParams
   ///
-  /// @param Node the node that contains a band to be optimized.
-  /// @return Modified isl_schedule_node.
+  /// TODO: Implement the packing transformation.
+  ///
+  /// @param Node The node that contains a band to be optimized. The node
+  ///             is required to successfully pass
+  ///             ScheduleTreeOptimizer::isMatrMultPattern.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
                         const llvm::TargetTransformInfo *TTI);
@@ -247,6 +259,20 @@ private:
   ///
   /// We create the BLIS macro-kernel by applying a combination of tiling
   /// of dimensions of the band node and interchanging of two innermost
+  /// modified dimensions. The values of MacroKernelParams's fields are used
+  /// as tile sizes.
+  ///
+  /// @param Node              The schedule node to be modified.
+  /// @param MacroKernelParams Parameters of the macro kernel
+  ///                          to be used as tile sizes.
+  static __isl_give isl_schedule_node *
+  createMacroKernel(__isl_take isl_schedule_node *Node,
+                    MacroKernelParamsTy MacroKernelParams);
+
+  /// @brief Create the BLIS macro-kernel.
+  ///
+  /// We create the BLIS macro-kernel by applying a combination of tiling
+  /// of dimensions of the band node and interchanging of two innermost
   /// modified dimensions. The values passed in MicroKernelParam are used
   /// as tile sizes.
   ///
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index e1689e54aac..acecf9dda5e 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -134,6 +134,17 @@ static cl::opt<int> ThrougputVectorFma(
              "instructions per clock cycle."),
     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::list<int>
+    CacheLevelAssociativity("polly-target-cache-level-associativity",
+                            cl::desc("The associativity of each cache level."),
+                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
+                            cl::cat(PollyCategory));
+
+static cl::list<int> CacheLevelSizes(
+    "polly-target-cache-level-sizes",
+    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
+    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
+
 static cl::opt<int> FirstLevelDefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -493,12 +504,52 @@ static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
 }
 
+/// @brief Permute two dimensions of the band node.
+///
+/// Permute FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static __isl_give isl_schedule_node *
+permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
+                          unsigned SecondDim) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
+         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
+  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
+  auto PartialScheduleFirstDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
+  auto PartialScheduleSecondDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, SecondDim, PartialScheduleFirstDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, FirstDim, PartialScheduleSecondDim);
+  Node = isl_schedule_node_delete(Node);
+  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
+  return Node;
+}
+
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
     __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
   return applyRegisterTiling(Node,
                              {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
 }
 
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
+    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+      MacroKernelParams.Kc == 1)
+    return Node;
+  Node = tileNode(
+      Node, "1st level tiling",
+      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
+  Node = permuteBandNodeDimensions(Node, 1, 2);
+  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
+}
+
 /// Get parameters of the BLIS micro kernel.
 ///
 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
@@ -525,10 +576,54 @@ getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
   return {Mr, Nr};
 }
 
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should be ideally kept in cache between
+/// iterations. Since parameters of the macro kernel determine sizes of these
+/// blocks, there are upper and lower bounds on these parameters.
+///
+/// @param MicroKernelParams Parameters of the micro-kernel
+///                          to be taken into account.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // it requires information about the first two levels of a cache to determine
+  // all the parameters of a macro-kernel. It also checks that an associativity
+  // degree of a cache level is greater than two. Otherwise, another algorithm
+  // for determination of the parameters should be used.
+  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
+        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
+        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
+    return {1, 1, 1};
+  int Cbr = floor(
+      (CacheLevelAssociativity[0] - 1) /
+      (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
+  int Kc = (Cbr * CacheLevelSizes[0]) /
+           (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
+  double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  int Mc = floor(MicroKernelParams.Mr / Cac);
+  int Nc =
+      floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
+  return {Mc, Nc, Kc};
+}
+
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
   assert(TTI && "The target transform info should be provided.");
   auto MicroKernelParams = getMicroKernelParams(TTI);
+  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
+  Node = createMacroKernel(Node, MacroKernelParams);
   Node = createMicroKernel(Node, MicroKernelParams);
   return Node;
 }
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index 51c9da2978c..45a352c8d37 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-cache-level-associativity=8,8 -polly-target-cache-level-sizes=32768,262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
@@ -62,6 +63,56 @@
 ; CHECK: }
 ; CHECK: }
 ;
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: for (int c0 = 0; c0 <= 65; c0 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c1 = 0; c1 <= 3; c1 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c2 = 0; c2 <= 10; c2 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: for (int c3 = 0; c3 <= 3; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c4 = 0; c4 <= 11; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL: for (int c5 = 0; c5 <= 255; c5 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL: {
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+; EXTRACTION-OF-MACRO-KERNEL: }
+;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
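
As a cross-check of the tile sizes the EXTRACTION-OF-MACRO-KERNEL lines expect (the strides 16, 96, and 256 of c0, c2, and c1 above), the following standalone sketch (illustrative only, not part of the patch) reproduces the getMacroKernelParams arithmetic for the cache parameters passed on the new RUN line, -polly-target-cache-level-associativity=8,8 and -polly-target-cache-level-sizes=32768,262144, together with the 4x8 micro-kernel visible in the unrolled register tile; the literal 8 in the formulas is presumably the element size of a double in bytes:

#include <cmath>
#include <cstdio>

// Recomputes the macro-kernel parameters for the cache configuration used in
// the test above (L1: 32 KiB, 8-way; L2: 256 KiB, 8-way; 4x8 micro-kernel).
int main() {
  const int Mr = 4, Nr = 8;             // micro-kernel tile sizes
  const int Assoc[2] = {8, 8};          // cache level associativity
  const int Size[2] = {32768, 262144};  // cache level sizes in bytes
  const int Cbr =
      std::floor((Assoc[0] - 1) / (1 + static_cast<double>(Mr) / Nr));
  const int Kc = (Cbr * Size[0]) / (Nr * Assoc[0] * 8);
  const double Cac = static_cast<double>(Mr * Kc * 8 * Assoc[1]) / Size[1];
  const double Cbc = static_cast<double>(Nr * Kc * 8 * Assoc[1]) / Size[1];
  const int Mc = std::floor(Mr / Cac);
  const int Nc = std::floor((Nr * (Assoc[1] - 2)) / Cbc);
  // Prints: Mc = 16, Nc = 96, Kc = 256
  std::printf("Mc = %d, Nc = %d, Kc = %d\n", Mc, Nc, Kc);
  return 0;
}

With these values the outer tile loops of the expected AST run for 66, 4, and 11 iterations, matching the bounds c0 <= 65, c1 <= 3, and c2 <= 10 checked above.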