Index: include/polly/ScheduleOptimizer.h
===================================================================
--- include/polly/ScheduleOptimizer.h
+++ include/polly/ScheduleOptimizer.h
@@ -115,8 +115,18 @@
   /// is a loop around a rank-1 (i.e., outer product) update.
   ///
   /// We create the BLIS micro-kernel by applying a combination of tiling
-  /// and unrolling. In subsequent changes we will add the extraction
-  /// of the BLIS macro-kernel and implement the packing transformation.
+  /// and unrolling. The BLIS macro-kernel is created by applying
+  /// a combination of tiling and interchanging The description of calculated
+  /// values and utilized algorithms can be found in
+  /// (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
+  /// However, it should be noticed that in our case matrixes are stored in
+  /// row-major order. Consequently we should use formulas specified for
+  /// the matrix A to calculate values related to the matrix B and vice versa.
+  /// We should also take into account the row-major order, when we perform
+  /// tiling and interchanging of loops to create the BLIS micro-kernel and
+  /// the BLIS macro-kernel.
+  ///
+  /// In subsequent changes we will implement the packing transformation.
   ///
   /// It is assumed that the Node is successfully checked
   /// by ScheduleTreeOptimizer::isMatrMultPattern. Consequently
@@ -227,6 +237,19 @@
   ///
   /// @param Node The node to check.
   static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node);
+
+  /// @brief Create the BLIS micro-kernel
+  ///
+  /// We create the BLIS macro-kernel by applying a combination of tiling
+  /// and interchanging.
+  ///
+  /// @param Node The schedule node to be modified.
+  /// @param Mr The parameter of the BLIS micro-kernel.
+  /// @param Nr The parameter of the BLIS micro-kernel.
+  ///
+  /// @see ScheduleTreeOptimizer::optimizeMatMulPattern
+  static __isl_give isl_schedule_node *
+  createMacroKernel(__isl_take isl_schedule_node *Node, int Mr, int Nr);
 };
 
 #endif
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -134,6 +134,17 @@
              "instructions per clock cycle."),
     cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::list<int> CacheLevelAssociativityDegrees(
+    "polly-target-cache-level-associativity-degrees",
+    cl::desc("An associativity degree of each cache level."), cl::Hidden,
+    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
+
+static cl::list<int>
+    CacheLevelSizes("polly-target-level-cache-sizes",
+                    cl::desc("A size of each cache level specified in bytes."),
+                    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
+                    cl::cat(PollyCategory));
+
 static cl::opt<int> FirstLevelDefaultTileSize(
     "polly-default-tile-size",
     cl::desc("The default tile size (if not enough were provided by"
@@ -493,10 +504,68 @@
   return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
 }
 
+/// @brief Permute two dimensions of the band node.
+///
+/// Permute FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static __isl_give isl_schedule_node *
+permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
+                          unsigned SecondDim) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
+         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
+  auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
+  auto PartialScheduleFirstDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
+  auto PartialScheduleSecondDim =
+      isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, SecondDim, PartialScheduleFirstDim);
+  PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
+      PartialSchedule, FirstDim, PartialScheduleSecondDim);
+  Node = isl_schedule_node_delete(Node);
+  Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
+  return Node;
+}
+
+__isl_give isl_schedule_node *
+ScheduleTreeOptimizer::createMacroKernel(__isl_take isl_schedule_node *Node,
+                                         int Mr, int Nr) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // it requires information about the first two levels of a cache to determine
+  // all the parameters of a macro-kernel. It also checks that an associativity
+  // degree of a cache level is greater that two. Otherwise, another algorithm
+  // for determination of the parameters should be used.
+  if (!(Mr > 0 && Nr > 0 && CacheLevelSizes.size() >= 2 &&
+        CacheLevelAssociativityDegrees.size() >= 2 && CacheLevelSizes[0] > 0 &&
+        CacheLevelSizes[1] > 0 && CacheLevelAssociativityDegrees[0] > 2 &&
+        CacheLevelAssociativityDegrees[1] > 2))
+    return Node;
+  int Cbr = floor((CacheLevelAssociativityDegrees[0] - 1) /
+                  (1 + static_cast<double>(Mr) / Nr));
+  int Kc =
+      (Cbr * CacheLevelSizes[0]) / (Nr * CacheLevelAssociativityDegrees[0] * 8);
+  double Cac =
+      static_cast<double>(Mr * Kc * 8 * CacheLevelAssociativityDegrees[1]) /
+      CacheLevelSizes[1];
+  double Cbc =
+      static_cast<double>(Nr * Kc * 8 * CacheLevelAssociativityDegrees[1]) /
+      CacheLevelSizes[1];
+  int Mc = floor(Mr / Cac);
+  int Nc = floor((Nr * (CacheLevelAssociativityDegrees[1] - 2)) / Cbc);
+  int MacroKernelParams[] = {Mc, Nc, Kc};
+  Node = tileNode(Node, "1nd level tiling", MacroKernelParams, 1);
+  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
+  Node = permuteBandNodeDimensions(Node, 1, 2);
+  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
+}
+
 __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
     __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
   assert(TTI && "The target transform info should be provided.");
-  // Get a micro-kernel.
   // Nvec - Number of double-precision floating-point numbers that can be hold
   // by a vector register. Use 2 by default.
   auto Nvec = TTI->getRegisterBitWidth(true) / 64;
@@ -505,7 +574,9 @@
   int Nr =
       ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
   int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
-  std::vector<int> MicroKernelParams{Mr, Nr};
+  Node = createMacroKernel(Node, Mr, Nr);
+  // Get a micro-kernel.
+  int MicroKernelParams[] = {Mr, Nr};
   Node = applyRegisterTiling(Node, MicroKernelParams, 1);
   return Node;
 }
Index: test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
===================================================================
--- test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-target-througput-vector-fma=1 -polly-target-latency-vector-fma=8 -analyze -polly-ast -polly-target-cache-level-associativity-degrees=8,8 -polly-target-level-cache-sizes=32768,262144 < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ;    /* C := alpha*A*B + beta*C */
 ;    for (i = 0; i < _PB_NI; i++)
@@ -62,6 +63,56 @@
 ; CHECK:          }
 ; CHECK:    }
 ;
+; EXTRACTION-OF-MACRO-KERNEL:      // 1nd level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL:      for (int c0 = 0; c0 <= 65; c0 += 1)
+; EXTRACTION-OF-MACRO-KERNEL:        for (int c1 = 0; c1 <= 3; c1 += 1)
+; EXTRACTION-OF-MACRO-KERNEL:          for (int c2 = 0; c2 <= 10; c2 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL:            // 1nd level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL:            // Register tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL:            for (int c3 = 0; c3 <= 3; c3 += 1)
+; EXTRACTION-OF-MACRO-KERNEL:              for (int c4 = 0; c4 <= 11; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL:                for (int c5 = 0; c5 <= 255; c5 += 1) {
+; EXTRACTION-OF-MACRO-KERNEL:                  // Register tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL:                  // 1st level tiling - Tiles
+; EXTRACTION-OF-MACRO-KERNEL:                  // 1st level tiling - Points
+; EXTRACTION-OF-MACRO-KERNEL:                  {
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                    Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5);
+; EXTRACTION-OF-MACRO-KERNEL:                  }
+; EXTRACTION-OF-MACRO-KERNEL:                }
+; EXTRACTION-OF-MACRO-KERNEL:          }
+; EXTRACTION-OF-MACRO-KERNEL:    }
+;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"