Index: include/polly/ScheduleOptimizer.h
===================================================================
--- include/polly/ScheduleOptimizer.h
+++ include/polly/ScheduleOptimizer.h
@@ -25,6 +25,27 @@
 class Scop;
 } // namespace polly
 
+/// @brief Parameters of the micro kernel.
+///
+/// Parameters that determine the size of the rank-1 (i.e., outer product)
+/// update used in the optimized matrix multiplication.
+///
+struct MicroKernelParamsTy {
+  int Mr;
+  int Nr;
+};
+
+/// @brief Parameters of the macro kernel.
+///
+/// Parameters that determine the sizes of the blocks of the partitioned
+/// matrices used in the optimized matrix multiplication.
+///
+struct MacroKernelParamsTy {
+  int Mc;
+  int Nc;
+  int Kc;
+};
+
 class ScheduleTreeOptimizer {
 public:
   /// @brief Apply schedule tree transformations.
@@ -105,27 +126,34 @@
   applyRegisterTiling(__isl_take isl_schedule_node *Node,
                       llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
 
-  /// @brief Apply the BLIS matmul optimization pattern
-  ///
-  /// Apply the BLIS matmul optimization pattern
-  /// (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
-  /// BLIS implements gemm as three nested loops around a macro-kernel,
-  /// plus two packing routines. The macro-kernel is implemented in terms
-  /// of two additional loops around a micro-kernel. The micro-kernel
-  /// is a loop around a rank-1 (i.e., outer product) update.
-  ///
-  /// We create the BLIS micro-kernel by applying a combination of tiling
-  /// and unrolling. In subsequent changes we will add the extraction
-  /// of the BLIS macro-kernel and implement the packing transformation.
-  ///
-  /// It is assumed that the Node is successfully checked
-  /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently
-  /// in case of matmul kernels the application of optimizeMatMulPattern
-  /// can lead to close-to-peak performance. Maybe it can be generalized
-  /// to effectively optimize the whole class of successfully checked
-  /// statements.
-  ///
-  /// @param Node the node that contains a band to be optimized.
+  /// @brief Apply the BLIS matmul optimization pattern.
+  ///
+  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
+  /// nested loops around a macro-kernel, plus two packing routines.
+  /// The macro-kernel is implemented in terms of two additional loops around
+  /// a micro-kernel. The micro-kernel is a loop around a rank-1
+  /// (i.e., outer product) update.
+  ///
+  /// For a detailed description please see:
+  /// Analytical Modeling is Enough for High Performance BLIS
+  /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+  /// Technical Report, 2014
+  /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+  ///
+  /// In our case, matrices are stored in row-major order, which is taken into
+  /// account during the creation of the BLIS kernels and the determination of
+  /// their parameters.
+  ///
+  /// @see ScheduleTreeOptimizer::createMicroKernel
+  /// @see ScheduleTreeOptimizer::createMacroKernel
+  /// @see getMicroKernelParams
+  /// @see getMacroKernelParams
+  ///
+  /// TODO: Implement the packing transformation.
+  ///
+  /// @param Node The node that contains a band to be optimized. It is assumed
+  ///             that it has been successfully checked
+  ///             by ScheduleTreeOptimizer::isMatrMultPattern.
   /// @return Modified isl_schedule_node.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
@@ -227,6 +255,35 @@
   ///
   /// @param Node The node to check.
   static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+
+  /// @brief Create the BLIS macro-kernel.
+  ///
+  /// We create the BLIS macro-kernel by applying a combination of tiling
+  /// of dimensions of the band node and interchanging of the two innermost
+  /// modified dimensions. Values of MacroKernelParams's fields are used
+  /// as tile sizes.
+  ///
+  /// @param Node The schedule node to be modified.
+  /// @param MacroKernelParams Parameters of the macro kernel
+  ///                          to be used as tile sizes.
+  static __isl_give isl_schedule_node *
+  createMacroKernel(__isl_take isl_schedule_node *Node,
+                    const MacroKernelParamsTy &MacroKernelParams);
+
+  /// @brief Create the BLIS micro-kernel.
+  ///
+  /// We create the BLIS micro-kernel by applying a combination of tiling
+  /// of the two outer dimensions of the band node and unrolling of the
+  /// modified schedule dimensions. Values of MicroKernelParams's fields
+  /// are used as tile sizes.
+  ///
+  /// @param Node The schedule node to be modified.
+  /// @param MicroKernelParams Parameters of the micro kernel
+  ///                          to be used as tile sizes.
+  /// @see MicroKernelParamsTy
+  static __isl_give isl_schedule_node *
+  createMicroKernel(__isl_take isl_schedule_node *Node,
+                    const MicroKernelParamsTy &MicroKernelParams);
 };
 
 #endif
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -134,6 +134,17 @@
              "instructions per clock cycle."),
     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::list<int>
+    CacheLevelAssociativity("polly-target-cache-level-associativity",
+                            cl::desc("The associativity of each cache level."),
+                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
+                            cl::cat(PollyCategory));
+
+static cl::list<int> CacheLevelSizes(
+    "polly-target-cache-level-sizes",
+    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
+    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
+
 static cl::opt<int> FirstLevelDefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -493,10 +504,69 @@
   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
 }
 
-__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+/// @brief Permute two dimensions of the band node.
+///
+/// Permute the FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static __isl_give isl_schedule_node *
+permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
+                          unsigned SecondDim) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
+         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
+  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
+  auto PartialScheduleFirstDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
+  auto PartialScheduleSecondDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, SecondDim, PartialScheduleFirstDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, FirstDim, PartialScheduleSecondDim);
+  Node = isl_schedule_node_delete(Node);
+  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
+  return Node;
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
+    __isl_take isl_schedule_node *Node,
+    const MicroKernelParamsTy &MicroKernelParams) {
+  return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
+                             1);
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
+    __isl_take isl_schedule_node *Node,
+    const MacroKernelParamsTy &MacroKernelParams) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+      MacroKernelParams.Kc == 1)
+    return Node;
+  Node = tileNode(
+      Node, "1st level tiling",
+      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
+  Node = permuteBandNodeDimensions(Node, 1, 2);
+  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
+}
+
+/// Get parameters of the BLIS micro kernel.
+///
+/// We try to determine the Mr and Nr parameters of the micro kernel that
+/// are large enough so that no stalls caused by the combination
+/// of latencies and dependencies are introduced during the updates
+/// of the resulting matrix of the matrix multiplication. However,
+/// they should also be as small as possible to release more registers
+/// for the entries of the multiplied matrices.
+///
+/// @param TTI Target Transform Info.
+/// @return The structure of type MicroKernelParamsTy.
+/// @see MicroKernelParamsTy
+static struct MicroKernelParamsTy
+getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
   assert(TTI && "The target transform info should be provided.");
-  // Get a micro-kernel.
   // Nvec - Number of double-precision floating-point numbers that can be hold
   // by a vector register. Use 2 by default.
   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
@@ -505,8 +575,58 @@
   int Nr =
       ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
   int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
-  std::vector<int> MicroKernelParams{Mr, Nr};
-  Node = applyRegisterTiling(Node, MicroKernelParams, 1);
+  return {Mr, Nr};
+}
+
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of the partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should ideally be kept in cache between
+/// iterations. This, along with the parameters of the micro-kernel, imposes
+/// bounds on the parameters of the macro kernel.
+///
+/// @param MicroKernelParams Parameters of the micro-kernel
+///                          to be taken into account.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // determining all parameters of the macro-kernel requires information about
+  // the first two cache levels. We also require the associativity degree of
+  // each cache level to be greater than two. Otherwise, another algorithm for
+  // the determination of the parameters should be used.
+  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
+        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
+        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
+    return {1, 1, 1};
+  int Cbr = floor(
+      (CacheLevelAssociativity[0] - 1) /
+      (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
+  int Kc = (Cbr * CacheLevelSizes[0]) /
+           (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
+  double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  int Mc = floor(MicroKernelParams.Mr / Cac);
+  int Nc =
+      floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
+  return {Mc, Nc, Kc};
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
+    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+  assert(TTI && "The target transform info should be provided.");
+  auto MicroKernelParams = getMicroKernelParams(TTI);
+  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
+  Node = createMacroKernel(Node, MacroKernelParams);
+  Node = createMicroKernel(Node, MicroKernelParams);
   return Node;
 }
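
For illustration, the following standalone sketch mirrors the formulas used by
getMicroKernelParams and getMacroKernelParams above, so the chosen kernel
parameters can be inspected outside of Polly. It is not part of the patch; the
target numbers (256-bit vector registers, an FMA latency of 5 cycles, a
throughput of 1 vector FMA per cycle, a 32 KiB 8-way L1 cache and a 256 KiB
8-way L2 cache) are assumed values for a hypothetical AVX2-like core, not
values taken from this change.

#include <cmath>
#include <cstdio>

struct MicroKernelParamsTy {
  int Mr;
  int Nr;
};

struct MacroKernelParamsTy {
  int Mc;
  int Nc;
  int Kc;
};

int main() {
  // Assumed target characteristics (hypothetical AVX2-like core).
  int Nvec = 256 / 64;         // Doubles held by one vector register.
  int LatencyVectorFma = 5;    // Latency of a vector FMA in cycles.
  int ThroughputVectorFma = 1; // Vector FMAs issued per clock cycle.
  int CacheSizes[2] = {32 * 1024, 256 * 1024}; // L1 and L2 sizes in bytes.
  int CacheAssociativity[2] = {8, 8};          // L1 and L2 associativity.

  // Micro-kernel: an Mr x Nr rank-1 update chosen large enough to hide the
  // FMA latency (same formulas as getMicroKernelParams above).
  int Nr = std::ceil(std::sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) /
                     Nvec) *
           Nvec;
  int Mr = std::ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
  MicroKernelParamsTy Micro{Mr, Nr};

  // Macro-kernel: Kc, Mc and Nc are chosen so that the blocks of the operands
  // stay resident in the L1/L2 caches, assuming 8 bytes per double (same
  // formulas as getMacroKernelParams above).
  int Cbr = std::floor((CacheAssociativity[0] - 1) /
                       (1 + static_cast<double>(Micro.Mr) / Micro.Nr));
  int Kc = (Cbr * CacheSizes[0]) / (Micro.Nr * CacheAssociativity[0] * 8);
  double Cac = static_cast<double>(Micro.Mr * Kc * 8 * CacheAssociativity[1]) /
               CacheSizes[1];
  double Cbc = static_cast<double>(Micro.Nr * Kc * 8 * CacheAssociativity[1]) /
               CacheSizes[1];
  int Mc = std::floor(Micro.Mr / Cac);
  int Nc = std::floor((Micro.Nr * (CacheAssociativity[1] - 2)) / Cbc);
  MacroKernelParamsTy Macro{Mc, Nc, Kc};

  std::printf("Mr=%d Nr=%d Kc=%d Mc=%d Nc=%d\n", Micro.Mr, Micro.Nr, Macro.Kc,
              Macro.Mc, Macro.Nc);
  return 0;
}

Within Polly itself, the same cache description would be passed through the
flags added by this patch, e.g. -polly-target-cache-level-sizes=32768,262144
and -polly-target-cache-level-associativity=8,8.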