Index: include/polly/ScheduleOptimizer.h =================================================================== --- include/polly/ScheduleOptimizer.h +++ include/polly/ScheduleOptimizer.h @@ -115,8 +115,18 @@ /// is a loop around a rank-1 (i.e., outer product) update. /// /// We create the BLIS micro-kernel by applying a combination of tiling - /// and unrolling. In subsequent changes we will add the extraction - /// of the BLIS macro-kernel and implement the packing transformation. + /// and unrolling. The BLIS macro-kernel is created by applying + /// a combination of tiling and interchanging The description of calculated + /// values and utilized algorithms can be found in + /// (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf). + /// However, it should be noticed that in our case matrixes are stored in + /// row-major order. Consequently we should use formulas specified for + /// the matrix A to calculate values related to the matrix B and vice versa. + /// We should also take into account the row-major order, when we perform + /// tiling and interchanging of loops to create the BLIS micro-kernel and + /// the BLIS macro-kernel. + /// + /// In subsequent changes we will implement the packing transformation. /// /// It is assumed that the Node is successfully checked /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently @@ -227,6 +237,19 @@ /// /// @param Node The node to check. static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node); + + /// @brief Create the BLIS micro-kernel + /// + /// We create the BLIS macro-kernel by applying a combination of tiling + /// and interchanging. + /// + /// @param Node The schedule node to be modified. + /// @param Mr The parameter of the BLIS micro-kernel. + /// @param Nr The parameter of the BLIS micro-kernel. + /// + /// @see ScheduleTreeOptimizer::optimizeMatMulPattern + static __isl_give isl_schedule_node * + createMacroKernel(__isl_take isl_schedule_node *Node, int Mr, int Nr); }; #endif Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -134,6 +134,17 @@ "instructions per clock cycle."), cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::list CacheLevelAssociativityDegrees( + "polly-target-cache-level-associativity-degrees", + cl::desc("An associativity degree of each cache level."), cl::Hidden, + cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory)); + +static cl::list + CacheLevelSizes("polly-target-level-cache-sizes", + cl::desc("A size of each cache level specified in bytes."), + cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, + cl::cat(PollyCategory)); + static cl::opt FirstLevelDefaultTileSize( "polly-default-tile-size", cl::desc("The default tile size (if not enough were provided by" @@ -493,10 +504,68 @@ return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId); } +/// @brief Permute two dimensions of the band node. +/// +/// Permute FirstDim and SecondDim dimensions of the Node. +/// +/// @param Node The band node to be modified. +/// @param FirstDim The first dimension to be permuted. +/// @param SecondDim The second dimension to be permuted. +static __isl_give isl_schedule_node * +permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim, + unsigned SecondDim) { + assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band && + isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim)); + auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node); + auto PartialScheduleFirstDim = + isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim); + auto PartialScheduleSecondDim = + isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim); + PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff( + PartialSchedule, SecondDim, PartialScheduleFirstDim); + PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff( + PartialSchedule, FirstDim, PartialScheduleSecondDim); + Node = isl_schedule_node_delete(Node); + Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule); + return Node; +} + +__isl_give isl_schedule_node * +ScheduleTreeOptimizer::createMacroKernel(__isl_take isl_schedule_node *Node, + int Mr, int Nr) { + assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band); + // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf, + // it requires information about the first two levels of a cache to determine + // all the parameters of a macro-kernel. It also checks that an associativity + // degree of a cache level is greater that two. Otherwise, another algorithm + // for determination of the parameters should be used. + if (!(Mr > 0 && Nr > 0 && CacheLevelSizes.size() >= 2 && + CacheLevelAssociativityDegrees.size() >= 2 && CacheLevelSizes[0] > 0 && + CacheLevelSizes[1] > 0 && CacheLevelAssociativityDegrees[0] > 2 && + CacheLevelAssociativityDegrees[1] > 2)) + return Node; + int Cbr = floor((CacheLevelAssociativityDegrees[0] - 1) / + (1 + static_cast(Mr) / Nr)); + int Kc = + (Cbr * CacheLevelSizes[0]) / (Nr * CacheLevelAssociativityDegrees[0] * 8); + double Cac = + static_cast(Mr * Kc * 8 * CacheLevelAssociativityDegrees[1]) / + CacheLevelSizes[1]; + double Cbc = + static_cast(Nr * Kc * 8 * CacheLevelAssociativityDegrees[1]) / + CacheLevelSizes[1]; + int Mc = floor(Mr / Cac); + int Nc = floor((Nr * (CacheLevelAssociativityDegrees[1] - 2)) / Cbc); + int MacroKernelParams[] = {Mc, Nc, Kc}; + Node = tileNode(Node, "1nd level tiling", MacroKernelParams, 1); + Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); + Node = permuteBandNodeDimensions(Node, 1, 2); + return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); +} + __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern( __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) { assert(TTI && "The target transform info should be provided."); - // Get a micro-kernel. // Nvec - Number of double-precision floating-point numbers that can be hold // by a vector register. Use 2 by default. auto Nvec = TTI->getRegisterBitWidth(true) / 64; @@ -505,7 +574,9 @@ int Nr = ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec; int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr); - std::vector MicroKernelParams{Mr, Nr}; + Node = createMacroKernel(Node, Mr, Nr); + // Get a micro-kernel. + int MicroKernelParams[] = {Mr, Nr}; Node = applyRegisterTiling(Node, MicroKernelParams, 1); return Node; } Index: test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -1,4 +1,5 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-cache-level-associativity-degrees=8,8 -polly-target-level-cache-sizes=32768,262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL ; ; /* C := alpha*A*B + beta*C */ ; for (i = 0; i < _PB_NI; i++) @@ -62,6 +63,56 @@ ; CHECK: } ; CHECK: } ; +; EXTRACTION-OF-MACRO-KERNEL: // 1nd level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL: for (int c0 = 0; c0 <= 65; c0 += 1) +; EXTRACTION-OF-MACRO-KERNEL: for (int c1 = 0; c1 <= 3; c1 += 1) +; EXTRACTION-OF-MACRO-KERNEL: for (int c2 = 0; c2 <= 10; c2 += 1) { +; EXTRACTION-OF-MACRO-KERNEL: // 1nd level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL: for (int c3 = 0; c3 <= 3; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL: for (int c4 = 0; c4 <= 11; c4 += 1) +; EXTRACTION-OF-MACRO-KERNEL: for (int c5 = 0; c5 <= 255; c5 += 1) { +; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Points +; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL: { +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL: } +; EXTRACTION-OF-MACRO-KERNEL: } +; EXTRACTION-OF-MACRO-KERNEL: } +; EXTRACTION-OF-MACRO-KERNEL: } +; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown"