Index: include/polly/ScheduleOptimizer.h
===================================================================
--- include/polly/ScheduleOptimizer.h
+++ include/polly/ScheduleOptimizer.h
@@ -25,6 +25,27 @@
 class Scop;
 } // namespace polly
 
+/// @brief Parameters of the micro kernel.
+///
+/// Parameters that determine the size of the rank-1 (i.e., outer product)
+/// update used in the optimized matrix multiplication.
+///
+struct MicroKernelParamsTy {
+  int Mr;
+  int Nr;
+};
+
+/// @brief Parameters of the macro kernel.
+///
+/// Parameters that determine the sizes of the blocks of the partitioned
+/// matrices used in the optimized matrix multiplication.
+///
+struct MacroKernelParamsTy {
+  int Mc;
+  int Nc;
+  int Kc;
+};
+
 class ScheduleTreeOptimizer {
 public:
   /// @brief Apply schedule tree transformations.
@@ -105,27 +126,34 @@
   applyRegisterTiling(__isl_take isl_schedule_node *Node,
                       llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
 
-  /// @brief Apply the BLIS matmul optimization pattern
-  ///
-  /// Apply the BLIS matmul optimization pattern
-  /// (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
-  /// BLIS implements gemm as three nested loops around a macro-kernel,
-  /// plus two packing routines. The macro-kernel is implemented in terms
-  /// of two additional loops around a micro-kernel. The micro-kernel
-  /// is a loop around a rank-1 (i.e., outer product) update.
-  ///
-  /// We create the BLIS micro-kernel by applying a combination of tiling
-  /// and unrolling. In subsequent changes we will add the extraction
-  /// of the BLIS macro-kernel and implement the packing transformation.
-  ///
-  /// It is assumed that the Node is successfully checked
-  /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently
-  /// in case of matmul kernels the application of optimizeMatMulPattern
-  /// can lead to close-to-peak performance. Maybe it can be generalized
-  /// to effectively optimize the whole class of successfully checked
-  /// statements.
-  ///
-  /// @param Node the node that contains a band to be optimized.
+  /// @brief Apply the BLIS matmul optimization pattern.
+  ///
+  /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three
+  /// nested loops around a macro-kernel, plus two packing routines.
+  /// The macro-kernel is implemented in terms of two additional loops around
+  /// a micro-kernel. The micro-kernel is a loop around a rank-1
+  /// (i.e., outer product) update.
+  ///
+  /// For a detailed description please see:
+  /// Analytical Modeling is Enough for High Performance BLIS
+  /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+  /// Technical Report, 2014
+  /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+  ///
+  /// In our case, matrices are stored in row-major order, which is taken into
+  /// account during the creation of the BLIS kernels and the determination of
+  /// their parameters.
+  ///
+  /// @see ScheduleTreeOptimizer::createMicroKernel
+  /// @see ScheduleTreeOptimizer::createMacroKernel
+  /// @see getMicroKernelParams
+  /// @see getMacroKernelParams
+  ///
+  /// TODO: Implement the packing transformation.
+  ///
+  /// @param Node The node that contains a band to be optimized. It is assumed
+  ///             that it has been successfully checked
+  ///             by ScheduleTreeOptimizer::isMatrMultPattern.
   /// @return Modified isl_schedule_node.
   static __isl_give isl_schedule_node *
   optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
@@ -227,6 +255,35 @@
   ///
   /// @param Node The node to check.
   static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+
+  /// @brief Create the BLIS macro-kernel.
+  ///
+  /// We create the BLIS macro-kernel by applying a combination of tiling
+  /// of dimensions of the band node and interchanging of the two innermost
+  /// modified dimensions. Values of MacroKernelParams's fields are used
+  /// as tile sizes.
+  ///
+  /// @param Node The schedule node to be modified.
+  /// @param MacroKernelParams Parameters of the macro kernel
+  ///                          to be used as tile sizes.
+  static __isl_give isl_schedule_node *
+  createMacroKernel(__isl_take isl_schedule_node *Node,
+                    const MacroKernelParamsTy &MacroKernelParams);
+
+  /// @brief Create the BLIS micro-kernel.
+  ///
+  /// We create the BLIS micro-kernel by applying a combination of tiling
+  /// of the two outer dimensions of the band node and unrolling of the
+  /// modified schedule dimensions. Values of MicroKernelParams's fields
+  /// are used as tile sizes.
+  ///
+  /// @param Node The schedule node to be modified.
+  /// @param MicroKernelParams Parameters of the micro kernel
+  ///                          to be used as tile sizes.
+  /// @see MicroKernelParamsTy
+  static __isl_give isl_schedule_node *
+  createMicroKernel(__isl_take isl_schedule_node *Node,
+                    const MicroKernelParamsTy &MicroKernelParams);
 };
 
 #endif
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -134,6 +134,17 @@
              "instructions per clock cycle."),
     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::list<int>
+    CacheLevelAssociativity("polly-target-cache-level-associativity",
+                            cl::desc("The associativity of each cache level."),
+                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
+                            cl::cat(PollyCategory));
+
+static cl::list<int> CacheLevelSizes(
+    "polly-target-cache-level-sizes",
+    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
+    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
+
 static cl::opt<int> FirstLevelDefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -493,10 +504,69 @@
   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
 }
 
-__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
-    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+/// @brief Permute two dimensions of the band node.
+///
+/// Permute the FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static __isl_give isl_schedule_node *
+permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
+                          unsigned SecondDim) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
+         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
+  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
+  auto PartialScheduleFirstDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
+  auto PartialScheduleSecondDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, SecondDim, PartialScheduleFirstDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, FirstDim, PartialScheduleSecondDim);
+  Node = isl_schedule_node_delete(Node);
+  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
+  return Node;
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
+    __isl_take isl_schedule_node *Node,
+    const MicroKernelParamsTy &MicroKernelParams) {
+  return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
+                             1);
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
+    __isl_take isl_schedule_node *Node,
+    const MacroKernelParamsTy &MacroKernelParams) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+      MacroKernelParams.Kc == 1)
+    return Node;
+  Node = tileNode(
+      Node, "1st level tiling",
+      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
+  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
+  Node = permuteBandNodeDimensions(Node, 1, 2);
+  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
+}
+
+/// Get parameters of the BLIS micro kernel.
+///
+/// We try to determine the Mr and Nr parameters of the micro kernel that
+/// are large enough so that no stalls caused by the combination
+/// of latencies and dependencies are introduced during the updates
+/// of the resulting matrix of the matrix multiplication. However,
+/// they should also be as small as possible to release more registers
+/// for the entries of the multiplied matrices.
+///
+/// @param TTI Target Transform Info.
+/// @return The structure of type MicroKernelParamsTy.
+/// @see MicroKernelParamsTy
+static struct MicroKernelParamsTy
+getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
   assert(TTI && "The target transform info should be provided.");
-  // Get a micro-kernel.
   // Nvec - Number of double-precision floating-point numbers that can be hold
   // by a vector register. Use 2 by default.
   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
@@ -505,8 +575,58 @@
   int Nr =
       ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
   int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
-  std::vector<int> MicroKernelParams{Mr, Nr};
-  Node = applyRegisterTiling(Node, MicroKernelParams, 1);
+  return {Mr, Nr};
+}
+
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of the partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should ideally be kept in cache between
+/// iterations. This, along with the parameters of the micro-kernel, imposes
+/// bounds on the parameters of the macro kernel.
+///
+/// @param MicroKernelParams Parameters of the micro-kernel
+///                          to be taken into account.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // determining all parameters of the macro-kernel requires information about
+  // the first two cache levels. We also require the associativity degree of
+  // each cache level to be greater than two. Otherwise, another algorithm for
+  // the determination of the parameters should be used.
+  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
+        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
+        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
+    return {1, 1, 1};
+  int Cbr = floor(
+      (CacheLevelAssociativity[0] - 1) /
+      (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr));
+  int Kc = (Cbr * CacheLevelSizes[0]) /
+           (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8);
+  double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 *
+                                   CacheLevelAssociativity[1]) /
+               CacheLevelSizes[1];
+  int Mc = floor(MicroKernelParams.Mr / Cac);
+  int Nc =
+      floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc);
+  return {Mc, Nc, Kc};
+}
+
+__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
+    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
+  assert(TTI && "The target transform info should be provided.");
+  auto MicroKernelParams = getMicroKernelParams(TTI);
+  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
+  Node = createMacroKernel(Node, MacroKernelParams);
+  Node = createMicroKernel(Node, MicroKernelParams);
   return Node;
 }
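
For illustration, the following standalone sketch mirrors the formulas used by
getMicroKernelParams and getMacroKernelParams above, so the chosen kernel
parameters can be inspected outside of Polly. It is not part of the patch; the
target numbers (256-bit vector registers, an FMA latency of 5 cycles, a
throughput of 1 vector FMA per cycle, a 32 KiB 8-way L1 cache and a 256 KiB
8-way L2 cache) are assumed values for a hypothetical AVX2-like core, not
values taken from this change.

#include <cmath>
#include <cstdio>

struct MicroKernelParamsTy {
  int Mr;
  int Nr;
};

struct MacroKernelParamsTy {
  int Mc;
  int Nc;
  int Kc;
};

int main() {
  // Assumed target characteristics (hypothetical AVX2-like core).
  int Nvec = 256 / 64;         // Doubles held by one vector register.
  int LatencyVectorFma = 5;    // Latency of a vector FMA in cycles.
  int ThroughputVectorFma = 1; // Vector FMAs issued per clock cycle.
  int CacheSizes[2] = {32 * 1024, 256 * 1024}; // L1 and L2 sizes in bytes.
  int CacheAssociativity[2] = {8, 8};          // L1 and L2 associativity.

  // Micro-kernel: an Mr x Nr rank-1 update chosen large enough to hide the
  // FMA latency (same formulas as getMicroKernelParams above).
  int Nr = std::ceil(std::sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) /
                     Nvec) *
           Nvec;
  int Mr = std::ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
  MicroKernelParamsTy Micro{Mr, Nr};

  // Macro-kernel: Kc, Mc and Nc are chosen so that the blocks of the operands
  // stay resident in the L1/L2 caches, assuming 8 bytes per double (same
  // formulas as getMacroKernelParams above).
  int Cbr = std::floor((CacheAssociativity[0] - 1) /
                       (1 + static_cast<double>(Micro.Mr) / Micro.Nr));
  int Kc = (Cbr * CacheSizes[0]) / (Micro.Nr * CacheAssociativity[0] * 8);
  double Cac = static_cast<double>(Micro.Mr * Kc * 8 * CacheAssociativity[1]) /
               CacheSizes[1];
  double Cbc = static_cast<double>(Micro.Nr * Kc * 8 * CacheAssociativity[1]) /
               CacheSizes[1];
  int Mc = std::floor(Micro.Mr / Cac);
  int Nc = std::floor((Micro.Nr * (CacheAssociativity[1] - 2)) / Cbc);
  MacroKernelParamsTy Macro{Mc, Nc, Kc};

  std::printf("Mr=%d Nr=%d Kc=%d Mc=%d Nc=%d\n", Micro.Mr, Micro.Nr, Macro.Kc,
              Macro.Mc, Macro.Nc);
  return 0;
}

Within Polly itself, the same cache description would be passed through the
flags added by this patch, e.g. -polly-target-cache-level-sizes=32768,262144
and -polly-target-cache-level-associativity=8,8.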