Index: polly/lib/Transform/MatmulOptimizer.cpp
===================================================================
--- polly/lib/Transform/MatmulOptimizer.cpp
+++ polly/lib/Transform/MatmulOptimizer.cpp
@@ -796,6 +796,46 @@
   return Node.graft_before(NewNode);
 }
 
+/// Create the copy statement and redirect access
+///
+/// Create a new statement, which copies data from the memory access location
+/// specified by the access relation @p AccRel to a location
+/// specified by @p AccRelPacked. The caller is expected to subsequently
+/// redirect the memory access relation @p AccRel of @p Stmt to
+/// @p AccRelPacked.
+///
+/// @param Stmt           The statement with an access @p AccRel
+///                       to be redirected.
+/// @param AccRel         The original access relation of @p Stmt.
+/// @param AccRelPacked   A new access relation.
+/// @param OuterDomainMap A map from the scatter space to the outermost scatter
+///                       dimensions that become the leading dimensions of the
+///                       copy statement domain.
+/// @param MapOldIndVar   The relation that maps the original induction
+///                       variables to the ones produced by schedule
+///                       transformations.
+/// @return A new copy statement.
+static ScopStmt *addScopStmt(ScopStmt *Stmt, isl::map AccRel,
+                             isl::map AccRelPacked, isl::map OuterDomainMap,
+                             isl::map MapOldIndVar) {
+  // { Memref[] -> Packed[] }
+  isl::map PackedTranslator = AccRelPacked.apply_domain(AccRel);
+  isl::set Domain = Stmt->getDomain();
+  // Prevent the unbound optimum
+  isl::map CopyFrom = MapOldIndVar.intersect_domain(Domain);
+  // { Scatter[] -> Memref[] }
+  CopyFrom = CopyFrom.reverse().apply_range(AccRel);
+  // { Scatter[] -> CopyStmt[] }
+  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
+  // { CopyStmt[] }
+  isl::set CopyDomain = DomainTranslator.range();
+
+  // Translate the access relations to the new domain.
+  // { CopyStmt[] -> Memref[] }
+  CopyFrom = CopyFrom.apply_domain(DomainTranslator);
+  // { CopyStmt[] -> Packed[] }
+  isl::map CopyTo = CopyFrom.apply_range(PackedTranslator);
+
+  return Stmt->getParent()->addScopStmt(CopyFrom, CopyTo, CopyDomain);
+}
+
 static isl::schedule_node optimizePackedB(isl::schedule_node Node,
                                           ScopStmt *Stmt, isl::map MapOldIndVar,
                                           MicroKernelParamsTy MicroParams,
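The helper above is pure isl map algebra, which is hard to follow from the relation comments alone. The following standalone sketch (not part of the patch) replays the same sequence of operations on a hypothetical two-dimensional statement using the isl C++ bindings; every set and map string, including the Outer[] tuple that stands in for the outermost scatter dimensions, is an assumption made only for illustration.

  // Illustration only: the map algebra of addScopStmt on a toy SCoP.
  #include <isl/cpp.h>
  #include <cstdio>
  #include <cstdlib>

  int main() {
    isl::ctx Ctx(isl_ctx_alloc());

    // Hypothetical inputs: a statement accessing Memref[], its packed
    // counterpart Packed[], a schedule into Scatter[], and a projection onto
    // the outermost scatter dimension.
    isl::map AccRel(Ctx, "{ Stmt[i, j] -> Memref[i, j] }");
    isl::map AccRelPacked(Ctx, "{ Stmt[i, j] -> Packed[j, i] }");
    isl::map MapOldIndVar(Ctx, "{ Stmt[i, j] -> Scatter[i, j] }");
    isl::set Domain(Ctx, "{ Stmt[i, j] : 0 <= i < 4 and 0 <= j < 4 }");
    isl::map OuterDomainMap(Ctx, "{ Scatter[i, j] -> Outer[i] }");

    // { Memref[] -> Packed[] }
    isl::map PackedTranslator = AccRelPacked.apply_domain(AccRel);
    // { Scatter[] -> Memref[] }, restricted to the statement domain.
    isl::map CopyFrom =
        MapOldIndVar.intersect_domain(Domain).reverse().apply_range(AccRel);
    // { Scatter[] -> CopyStmt[] }: pair the outer dimensions with the
    // accessed array element.
    isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
    // { CopyStmt[] }: one copy-statement instance per array element.
    isl::set CopyDomain = DomainTranslator.range();
    // { CopyStmt[] -> Memref[] } and { CopyStmt[] -> Packed[] }.
    CopyFrom = CopyFrom.apply_domain(DomainTranslator);
    isl::map CopyTo = CopyFrom.apply_range(PackedTranslator);

    auto Dump = [](const char *Name, char *Str) {
      std::printf("%s: %s\n", Name, Str);
      std::free(Str);
    };
    Dump("CopyDomain", isl_set_to_str(CopyDomain.get()));
    Dump("CopyFrom", isl_map_to_str(CopyFrom.get()));
    Dump("CopyTo", isl_map_to_str(CopyTo.get()));
    return 0;
  }

CopyDomain comes out as the wrapped product of Outer[] and Memref[], i.e. the "outer scatter dimensions plus one instance per array element" domain that the patch constructs for its copy statements.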
@@ -814,23 +854,29 @@
 
   // Compute the access relation for copying from B to PackedB.
   isl::map AccRelB = MMI.B->getLatestAccessRelation();
+  unsigned AccRelBDimOutNum = unsignedFromIslSize(AccRelB.dim(isl::dim::out));
   isl::map AccRelPackedB = getMatMulAccRel(MapOldIndVar, 3, 7);
   AccRelPackedB = AccRelPackedB.set_tuple_id(isl::dim::out,
                                              PackedB->getBasePtrId());
 
-  // Create the copy statement and redirect access.
-  ScopStmt *CopyStmt = S->addScopStmt(AccRelB, AccRelPackedB, Domain);
+  // Compute the domain for the copy statement.
+  // Construct the copy statement domain out of the 2 outermost scatter
+  // dimensions (to match the 2 band nodes surrounding the extension node) and
+  // the array elements to copy (one statement instance per array element).
+  // { Scatter[] }
+  isl::set ScatterDomain = MapOldIndVar.intersect_domain(Domain).range();
+  isl::map OuterDomainMap =
+      makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 2, 7);
+
+  ScopStmt *CopyStmt =
+      addScopStmt(Stmt, AccRelB, AccRelPackedB, OuterDomainMap, MapOldIndVar);
   MMI.B->setNewAccessRelation(AccRelPackedB);
 
-  unsigned Dim = unsignedFromIslSize(MapOldIndVar.range_tuple_dim());
-  assert(Dim >= 2);
-  // Insert into the schedule tree.
-  isl::map ExtMap = MapOldIndVar.project_out(isl::dim::out, 2, Dim - 2);
-  ExtMap = ExtMap.reverse();
-  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
-  ExtMap = ExtMap.intersect_range(Domain);
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, CopyStmt->getDomainId());
-  return createExtensionNode(Node, ExtMap);
+  // { Scatter[] -> CopyStmt[] }
+  isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
+  ExtScatterCopy =
+      ExtScatterCopy.project_out(isl::dim::in, 2, AccRelBDimOutNum);
+  return createExtensionNode(Node, ExtScatterCopy);
 }
 
 static isl::schedule_node optimizePackedA(isl::schedule_node Node, ScopStmt *,
@@ -853,11 +899,10 @@
 
   // Compute the access relation for copying from A to PackedA.
   isl::map AccRelA = MMI.A->getLatestAccessRelation();
+  unsigned AccRelADimOutNum = unsignedFromIslSize(AccRelA.dim(isl::dim::out));
   isl::map AccRelPackedA = getMatMulAccRel(MapOldIndVar, 4, 6);
   AccRelPackedA = AccRelPackedA.set_tuple_id(isl::dim::out,
                                              PackedA->getBasePtrId());
-  // { MemrefA[] -> PackedA[] }
-  isl::map PackedATranslator = AccRelPackedA.apply_domain(AccRelA);
 
   // Compute the domain for the copy statement.
   // Construct the copy statement domain out of the 3 outermost scatter
@@ -868,28 +913,17 @@
   // { Scatter[] -> OutermostScatter[] }
   isl::map OuterDomainMap =
       makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 3, 6);
-  // { Scatter[] -> MemrefA[] }
-  isl::map CopyFrom = MapOldIndVar.reverse().apply_range(AccRelA);
-  // { Scatter[] -> CopyStmt[] }
-  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
-  // { CopyStmt[] }
-  isl::set CopyDomain = DomainTranslator.range();
-
-  // Translate the access relations to the new domain.
-  // { CopyStmt[] -> MemrefA[] }
-  CopyFrom = CopyFrom.apply_domain(DomainTranslator);
-  // { CopyStmt[] -> PackedA[] }
-  isl::map CopyTo = CopyFrom.apply_range(PackedATranslator);
 
   // Create the copy statement and redirect access.
   ScopStmt *CopyStmt =
-      Stmt->getParent()->addScopStmt(CopyFrom, CopyTo, CopyDomain);
+      addScopStmt(Stmt, AccRelA, AccRelPackedA, OuterDomainMap, MapOldIndVar);
   MMI.A->setNewAccessRelation(AccRelPackedA);
 
   // Insert into the schedule tree.
   // { Scatter[] -> CopyStmt[] }
   isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
-  ExtScatterCopy = ExtScatterCopy.project_out(isl::dim::in, 3, 2);
+  ExtScatterCopy =
+      ExtScatterCopy.project_out(isl::dim::in, 3, AccRelADimOutNum);
   return createExtensionNode(Node, ExtScatterCopy);
 }
 
@@ -1818,6 +1852,253 @@
   return false;
 }
 
+/// Order the dimensions of operands of the tensor contraction.
+///
+/// To improve the spatial locality of the data, we use a heuristic, which is
+/// introduced in [1]. The heuristic logically reorders the tensor dimensions
+/// and consists of the following steps:
+///
+/// 1. Sort the dimensions in bundles I and J by increasing stride in
+///    the tensor C.
+/// 2. If tensor A contains the last dimension of C, swap A with B and I with J.
+/// 3. Sort the dimensions in the bundle P by increasing stride in the tensor A.
+///
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+///
+/// @param TCI The information about the tensor contraction.
+static void orderDimensions(TCInfoTy &TCI) {
+  // Sort the dimensions in bundles I and J by increasing stride in
+  // the tensor C
+  for (const int &Dimension : TCI.CDimensions) {
+    assert(TCI.I.count(Dimension) || TCI.J.count(Dimension));
+    if (TCI.I.count(Dimension)) {
+      TCI.OrderedI.push_back(Dimension);
+      continue;
+    }
+    if (TCI.J.count(Dimension))
+      TCI.OrderedJ.push_back(Dimension);
+  }
+
+  // If tensor A contains the last dimension of C, swap A with B and I with J
+  if (TCI.I.count(TCI.CDimensions.back())) {
+    MemoryAccess *Access = TCI.A;
+    TCI.A = TCI.B;
+    TCI.B = Access;
+    TCI.I.swap(TCI.J);
+    TCI.ADimensions.swap(TCI.BDimensions);
+    TCI.OrderedI.swap(TCI.OrderedJ);
+  }
+
+  // Sort the dimensions in the bundle P by increasing stride in the tensor A
+  for (const int &Dimension : TCI.ADimensions)
+    if (TCI.P.count(Dimension))
+      TCI.OrderedP.push_back(Dimension);
+}
+
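As a concrete illustration of the ordering heuristic above, here is a standalone mock-up (not part of the patch) of the three steps applied to a hypothetical contraction C[i1][i2][j] += A[i1][i2][p] * B[p][j]. Plain standard containers stand in for the TCInfoTy fields, and the dimension names and bundles are assumptions made only for this example.

  #include <cstdio>
  #include <set>
  #include <utility>
  #include <vector>

  int main() {
    enum { i1, i2, j, p };
    std::set<int> I = {i1, i2}, J = {j}, P = {p};
    std::vector<int> CDims = {i1, i2, j}, ADims = {i1, i2, p}, BDims = {p, j};
    std::vector<int> OrderedI, OrderedJ, OrderedP;

    // Step 1: take the I- and J-dimensions in the order of their appearance
    // in C (the patch's proxy for increasing stride in C).
    for (int Dim : CDims) {
      if (I.count(Dim))
        OrderedI.push_back(Dim);
      else if (J.count(Dim))
        OrderedJ.push_back(Dim);
    }

    // Step 2: if A contains the last dimension of C, swap the roles of A and
    // B (and of I and J).
    if (I.count(CDims.back())) {
      std::swap(ADims, BDims);
      std::swap(I, J);
      std::swap(OrderedI, OrderedJ);
    }

    // Step 3: take the P-dimensions in the order of their appearance in A.
    for (int Dim : ADims)
      if (P.count(Dim))
        OrderedP.push_back(Dim);

    // For this contraction: OrderedI = {i1, i2}, OrderedJ = {j},
    // OrderedP = {p}; no swap happens because the last dimension of C (j)
    // belongs to the J bundle.
    std::printf("|I| = %zu, |J| = %zu, |P| = %zu\n", OrderedI.size(),
                OrderedJ.size(), OrderedP.size());
    return 0;
  }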
+/// Logically map dimensions of a tensor to dimensions of an imaginary matrix
+///
+/// flattenDimensions returns a new dimension, which is a linear combination of
+/// @p Dimensions in the order specified by @p DimensionsOrder. Specifically,
+/// a new dimension J can be represented as
+/// J = Dimensions[D0] * A0 + … + Dimensions[D(N - 1)] * A(N - 1),
+/// where Di = DimensionsOrder[i],
+/// Ai = DimensionsSizes[D(i + 1)] * … * DimensionsSizes[D(N - 1)].
+///
+/// Using the described one-to-one mapping, which is introduced in [1], the
+/// optimization of the matrix multiplication of the corresponding imaginary
+/// matrices can be performed for the tensor contraction.
+///
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+///
+/// @param Dimensions      Dimensions of the tensor.
+/// @param DimensionsOrder An order of dimensions in the linear combination.
+/// @param DimensionsSizes Sizes of dimensions in the linear combination.
+/// @return A new dimension.
+static isl::union_pw_aff
+flattenDimensions(isl::multi_union_pw_aff Dimensions,
+                  const SmallVector<int> &DimensionsOrder,
+                  const SmallVector<int> &DimensionsSizes) {
+  isl::union_pw_aff NewDimension;
+  isl::ctx Ctx = Dimensions.ctx();
+  for (const int &Dim : DimensionsOrder) {
+    // Handle the first element in the linear combination of dimensions
+    if (NewDimension.is_null()) {
+      NewDimension = isl::manage(
+          isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim));
+      continue;
+    }
+
+    // Update the linear combination
+    int DimSize = DimensionsSizes[Dim];
+    if (DimSize > 0)
+      NewDimension = isl::manage(isl_union_pw_aff_scale_val(
+          NewDimension.release(), isl::val(Ctx, DimSize).release()));
+    else
+      NewDimension = isl::manage(isl_union_pw_aff_scale_val(
+          NewDimension.release(), isl::val(Ctx, 1).release()));
+
+    // Add a new element to the linear combination
+    NewDimension = NewDimension.add(isl::manage(
+        isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim)));
+  }
+  return NewDimension;
+}
+
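The linearization above is easier to see on plain integers. The sketch below (not part of the patch) performs the same scale-then-add loop on a flat index instead of on isl union_pw_affs; the dimension sizes, the order, and the sample point are assumptions made only for this example.

  #include <cassert>
  #include <vector>

  // Combine the indices selected by Order into one flat index, scaling the
  // running value by the size of the current dimension before adding it,
  // exactly as the loop in flattenDimensions does.
  static long flatten(const std::vector<long> &Indices,
                      const std::vector<int> &Order,
                      const std::vector<int> &Sizes) {
    long Flat = 0;
    bool First = true;
    for (int Dim : Order) {
      if (!First)
        Flat *= Sizes[Dim] > 0 ? Sizes[Dim] : 1;
      Flat += Indices[Dim];
      First = false;
    }
    return Flat;
  }

  int main() {
    // Three tensor dimensions of sizes 4, 5, and 6; dimensions 0 and 2 are
    // flattened (in that order) into a single imaginary matrix dimension.
    std::vector<int> Sizes = {4, 5, 6};
    std::vector<int> Order = {0, 2};
    // The point (i0, i1, i2) = (3, 1, 2) maps to 3 * 6 + 2 = 20, i.e. the
    // coefficient of each index is the product of the sizes of the
    // dimensions that follow it in the chosen order.
    assert(flatten({3, 1, 2}, Order, Sizes) == 20);
    return 0;
  }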
+/// Logically represent tensor contraction operands as matrix multiplication
+/// operands.
+///
+/// Convert the information about tensor contraction operands into the
+/// information about matrix multiplication operands.
+///
+/// @param TCI The information about the tensor contraction.
+/// @return The information about matrix multiplication operands.
+static MatMulInfoTy convertIntoMatMulInfo(const TCInfoTy &TCI) {
+  MatMulInfoTy NewMatMulInfo;
+  NewMatMulInfo.A = TCI.A;
+  NewMatMulInfo.B = TCI.B;
+  NewMatMulInfo.ReadFromC = TCI.ReadFromC;
+  NewMatMulInfo.WriteToC = TCI.WriteToC;
+  return NewMatMulInfo;
+}
+
+/// Logically cast the schedule tree, which represents the tensor contraction,
+/// into a schedule tree, which represents the matrix multiplication.
+///
+/// convertIntoMMMSchedule maps dimensions of tensors to dimensions of
+/// imaginary matrices, which are operands of the matrix multiplication. Using
+/// the described one-to-one mapping, the optimization of the matrix
+/// multiplication of the corresponding imaginary matrices can be performed
+/// for the tensor contraction.
+///
+/// @param Node The schedule node to be optimized.
+/// @param TTI  Target Transform Info.
+/// @param TCI  The information about the tensor contraction.
+/// @return The optimized schedule node.
+static isl::schedule_node
+convertIntoMMMSchedule(isl::schedule_node Node,
+                       const llvm::TargetTransformInfo *TTI,
+                       const TCInfoTy &TCI) {
+  assert(TTI && "The target transform info should be provided.");
+
+  // Obtain dimensions of tensors
+  isl::union_set Domain = Node.get_universe_domain();
+  isl::union_pw_multi_aff IdentityUnionPwAff =
+      Domain.identity_union_pw_multi_aff();
+  auto Dimensions = isl::multi_union_pw_aff(IdentityUnionPwAff);
+  Dimensions = Dimensions.reset_tuple_id(isl::dim::set);
+
+  // Map dimensions of tensors to dimensions of imaginary matrices
+  isl::union_pw_aff IDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedI, TCI.DimensionSizes);
+  isl::union_pw_aff JDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedJ, TCI.DimensionSizes);
+  isl::union_pw_aff PDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedP, TCI.DimensionSizes);
+
+  unsigned DimNumber = unsignedFromIslSize(Dimensions.dim(isl::dim::out));
+  Dimensions = isl::manage(isl_multi_union_pw_aff_drop_dims(
+      Dimensions.release(), isl_dim_out, 3, DimNumber - 3));
+  Dimensions = Dimensions.set_union_pw_aff(0, IDimensions);
+  Dimensions = Dimensions.set_union_pw_aff(1, JDimensions);
+  Dimensions = Dimensions.set_union_pw_aff(2, PDimensions);
+
+  // Modify the schedule tree
+  auto NodeType = isl_schedule_node_get_type(Node.get());
+  while ((NodeType != isl_schedule_node_domain) &&
+         (NodeType != isl_schedule_node_filter)) {
+    assert((NodeType != isl_schedule_node_sequence) &&
+           "Prevent the undefined behavior");
+    Node = Node.parent();
+    NodeType = isl_schedule_node_get_type(Node.get());
+  }
+  Node = Node.child(0);
+  Node = isl::manage(isl_schedule_node_cut(Node.release()));
+  return Node.insert_partial_schedule(Dimensions);
+}
+
+/// Apply the BLIS matmul optimization pattern if possible.
+///
+/// Map dimensions of tensors to dimensions of imaginary matrices,
+/// which are operands of the matrix multiplication. Using the described
+/// one-to-one mapping, which is introduced in [1], the optimization of
+/// the matrix multiplication of the corresponding imaginary matrices
+/// can be performed for the tensor contraction.
+///
+/// Specifically, make the loops containing the matrix multiplication the
+/// innermost loops and apply the BLIS matmul optimization pattern. BLIS
+/// implements matrix multiplication as three nested loops around
+/// a macro-kernel, plus two packing routines. The macro-kernel is implemented
+/// in terms of two additional loops around a micro-kernel.
+/// The micro-kernel is a loop around a rank-1 (i.e., outer product) update.
+///
+/// For a detailed description please see [2].
+///
+/// The order of the loops defines the data reused in the BLIS implementation
+/// of matrix multiplication ([2]). In particular, elements of the matrix B,
+/// the second operand of matrix multiplication, are reused between
+/// iterations of the innermost loop. To keep the reused data in cache, only
+/// elements of matrix A, the first operand of matrix multiplication,
+/// should be evicted during an iteration of the innermost loop. To provide
+/// such a cache replacement policy, elements of the matrix A can,
+/// in particular, be loaded first and, consequently, be least-recently-used.
+///
+/// In our case matrices are stored in row-major order instead of the
+/// column-major order used in the BLIS implementation ([2]). This affects
+/// only the form of the BLIS micro-kernel and the computation of its
+/// parameters. In particular, reused elements of the matrix B are
+/// successively multiplied by specific elements of the matrix A.
+///
+/// Refs.:
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+/// [2] - Analytical Modeling is Enough for High Performance BLIS
+/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+/// Technical Report, 2014
+/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMicroKernelParams
+/// @see getMacroKernelParams
+///
+/// @param Node The node that contains a band to be optimized. The node
+///             is required to successfully pass
+///             ScheduleTreeOptimizer::isTCPattern.
+/// @param TTI  Target Transform Info.
+/// @param TCI  The information about the tensor contraction.
+///
+/// @returns The transformed schedule or nullptr if the optimization
+///          cannot be applied.
+static isl::schedule_node
+optimizeTCPattern(isl::schedule_node Node, const llvm::TargetTransformInfo *TTI,
+                  TCInfoTy &TCI) {
+  orderDimensions(TCI);
+
+  assert(TTI && "The target transform info should be provided.");
+  Node = convertIntoMMMSchedule(Node, TTI, TCI);
+
+  MatMulInfoTy MMI = convertIntoMatMulInfo(TCI);
+  MicroKernelParamsTy MicroKernelParams = getMicroKernelParams(TTI, MMI);
+  MacroKernelParamsTy MacroKernelParams =
+      getMacroKernelParams(TTI, MicroKernelParams, MMI);
+  Node = createMacroKernel(Node, MacroKernelParams);
+  Node = createMicroKernel(Node, MicroKernelParams);
+  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
+      MacroKernelParams.Kc == 1)
+    return Node;
+  isl::map MapOldIndVar = getInductionVariablesSubstitution(
+      Node, MicroKernelParams, MacroKernelParams);
+  if (MapOldIndVar.is_null())
+    return Node;
+  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
+  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
+  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
+                                          MacroKernelParams, MMI);
+}
+
 } // namespace
 
 isl::schedule_node
@@ -1825,8 +2106,10 @@
     const llvm::TargetTransformInfo *TTI, const Dependences *D) {
   TCInfoTy TCI;
-  if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
+  if (PMBasedTCOpts && isTCPattern(Node, D, TCI)) {
     LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
+    return optimizeTCPattern(Node, TTI, TCI);
+  }
 
   MatMulInfoTy MMI;
   if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
     LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
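The loop structure that the comment above describes is easier to picture in plain code. The sketch below (not part of the patch) shows a schematic C += A * B with BLIS-style tiling: three loops around a macro-kernel with two packing steps, a macro-kernel of two loops around a micro-kernel, and a micro-kernel that is a loop around rank-1 updates. The tile sizes and the row-major layout are assumptions for the example; the packing comments merely indicate where the copy statements (CopyStmt_0 and CopyStmt_1 in the tests below) are inserted, and edge handling is simplified.

  #include <algorithm>
  #include <vector>

  static void gemmBlisSketch(int M, int N, int K, const std::vector<double> &A,
                             const std::vector<double> &B,
                             std::vector<double> &C, int Mc, int Nc, int Kc,
                             int Mr, int Nr) {
    for (int jc = 0; jc < N; jc += Nc)       // 1st loop around the macro-kernel
      for (int pc = 0; pc < K; pc += Kc) {   // 2nd loop around the macro-kernel
        // Packing of B would happen here (cf. CopyStmt_0).
        for (int ic = 0; ic < M; ic += Mc) { // 3rd loop around the macro-kernel
          // Packing of A would happen here (cf. CopyStmt_1).
          // Macro-kernel: two loops around the micro-kernel.
          for (int jr = jc; jr < std::min(jc + Nc, N); jr += Nr)
            for (int ir = ic; ir < std::min(ic + Mc, M); ir += Mr)
              // Micro-kernel: a loop around a rank-1 update of an
              // Mr x Nr block of C.
              for (int p = pc; p < std::min(pc + Kc, K); ++p)
                for (int i = ir; i < std::min(ir + Mr, M); ++i)
                  for (int j = jr; j < std::min(jr + Nr, N); ++j)
                    C[i * N + j] += A[i * K + p] * B[p * N + j];
        }
      }
  }

  int main() {
    const int M = 64, N = 64, K = 64;
    std::vector<double> A(M * K, 1.0), B(K * N, 1.0), C(M * N, 0.0);
    gemmBlisSketch(M, N, K, A, B, C, /*Mc=*/16, /*Nc=*/32, /*Kc=*/8, /*Mr=*/4,
                   /*Nr=*/8);
    return C[0] == K ? 0 : 1;
  }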
Index: polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
===================================================================
--- polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -30,9 +30,9 @@
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1055; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1055; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= 96 * c2 + 95; c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= min(1022, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,8 @@
 ; RUN: opt %loadPolly \
 ; RUN: -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: -polly-opt-isl -polly-tc-opt=true -polly-ast-detect-parallel \
+; RUN: -polly-print-ast -disable-output < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts
 
@@ -39,8 +40,77 @@
 ;        permutable: 1
 ;        coincident: [ 1, 1, 0 ]
 ;
-; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
+; CHECK:      {
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c0 = 0; c0 <= 49; c0 += 1)
+; CHECK-NEXT:     for (int c1 = 0; c1 <= 56; c1 += 1) {
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; CHECK-NEXT:         for (int c3 = 0; c3 <= min(31, -32 * c1 + 1799); c3 += 1)
+; CHECK-NEXT:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:     }
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c0 = 0; c0 <= 49; c0 += 1)
+; CHECK-NEXT:     for (int c1 = 0; c1 <= 56; c1 += 1) {
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; CHECK-NEXT:         for (int c3 = 0; c3 <= min(31, -32 * c1 + 1799); c3 += 1)
+; CHECK-NEXT:           Stmt_for_body3_last(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:     }
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c1 = 0; c1 <= 8; c1 += 1) {
+; CHECK-NEXT:     for (int c4 = 256 * c1; c4 <= min(2199, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:       for (int c5 = 0; c5 <= 1799; c5 += 1)
+; CHECK-NEXT:         CopyStmt_0(0, c1, c4, c5);
+; CHECK-NEXT:     for (int c2 = 0; c2 <= 16; c2 += 1) {
+; CHECK-NEXT:       for (int c6 = 96 * c2; c6 <= min(1599, 96 * c2 + 95); c6 += 1)
+; CHECK-NEXT:         for (int c7 = 256 * c1; c7 <= min(2199, 256 * c1 + 255); c7 += 1)
+; CHECK-NEXT:           CopyStmt_1(0, c1, c2, c6, c7);
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       // Register tiling - Tiles
+; CHECK-NEXT:       for (int c3 = 0; c3 <= 224; c3 += 1)
+; CHECK-NEXT:         for (int c4 = 0; c4 <= min(23, -24 * c2 + 399); c4 += 1)
+; CHECK-NEXT:           for (int c5 = 0; c5 <= min(255, -256 * c1 + 2199); c5 += 1) {
+; CHECK-NEXT:             // Loop Vectorizer Disabled
+; CHECK-NEXT:             // Register tiling - Points
+; CHECK-NEXT:             {
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:             }
+; CHECK-NEXT:           }
+; CHECK-NEXT:       }
+; CHECK-NEXT:     }
+; CHECK-NEXT: }
 ;
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -17,7 +17,6 @@
 ; CHECK-NOT: The matrix multiplication pattern was detected
 ; CHECK-NOT: The tensor contraction pattern was detected
 ; PATTERN-MATCHING-OPTS: The tensor contraction pattern was detected
-; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ; PARALLEL-AST-NOT: #pragma known-parallel
 ; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
 ;
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -18,7 +18,6 @@
 ; arrays.
 ;
 ; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -13,9 +13,9 @@
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c0 = 0; c0 <= 1; c0 += 1)
 ; CHECK-NEXT:      for (int c1 = 0; c1 <= 6; c1 += 1) {
-; CHECK-NEXT:        for (int c3 = 1536 * c0; c3 <= min(1999, 1536 * c0 + 1535); c3 += 1)
-; CHECK-NEXT:          for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
-; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
+; CHECK-NEXT:          for (int c5 = 1536 * c0; c5 <= min(1999, 1536 * c0 + 1535); c5 += 1)
+; CHECK-NEXT:            CopyStmt_0(c0, c1, c4, c5);
 ; CHECK-NEXT:        for (int c2 = 0; c2 <= 24; c2 += 1) {
 ; CHECK-NEXT:          for (int c6 = 80 * c2; c6 <= 80 * c2 + 79; c6 += 1)
 ; CHECK-NEXT:            for (int c7 = 307 * c1; c7 <= min(1999, 307 * c1 + 306); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -88,9 +88,9 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    // 1st level tiling - Tiles
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c3 = 0; c3 <= 1055; c3 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:          CopyStmt_0(0, c3, c4);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c5 = 0; c5 <= 1055; c5 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c6 = 96 * c2; c6 <= 96 * c2 + 95; c6 += 1)
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -22,13 +22,12 @@
 ;      C[i][j] += A[i][k] * B[k][j];
 ;
 ; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
 ;
 ; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles
 ; PATTERN-MATCHING-OPTS-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) {
-; PATTERN-MATCHING-OPTS-NEXT:   for (int c3 = 256 * c1; c3 <= 256 * c1 + 255; c3 += 1)
-; PATTERN-MATCHING-OPTS-NEXT:     for (int c4 = 0; c4 <= 1023; c4 += 1)
-; PATTERN-MATCHING-OPTS-NEXT:       CopyStmt_0(0, c3, c4);
+; PATTERN-MATCHING-OPTS-NEXT:   for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:     for (int c5 = 0; c5 <= 1023; c5 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:       CopyStmt_0(0, c1, c4, c5);
 ; PATTERN-MATCHING-OPTS-NEXT:   for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; PATTERN-MATCHING-OPTS-NEXT:     for (int c6 = 96 * c2; c6 <= min(1023, 96 * c2 + 95); c6 += 1)
 ; PATTERN-MATCHING-OPTS-NEXT:       for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -40,9 +40,9 @@
 ; CHECK-NEXT:    // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c0 = 0; c0 <= floord(nj - 1, 2048); c0 += 1)
 ; CHECK-NEXT:      for (int c1 = 0; c1 <= floord(nk - 1, 256); c1 += 1) {
-; CHECK-NEXT:        for (int c3 = 2048 * c0; c3 <= min(nj - 1, 2048 * c0 + 2047); c3 += 1)
-; CHECK-NEXT:          for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:          for (int c5 = 2048 * c0; c5 <= min(nj - 1, 2048 * c0 + 2047); c5 += 1)
+; CHECK-NEXT:            CopyStmt_0(c0, c1, c4, c5);
 ; CHECK-NEXT:        for (int c2 = 0; c2 <= floord(ni - 1, 96); c2 += 1) {
 ; CHECK-NEXT:          for (int c6 = 96 * c2; c6 <= min(ni - 1, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:            for (int c7 = 256 * c1; c7 <= min(nk - 1, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -40,9 +40,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1019; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(1019, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= min(1019, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1019; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= min(1019, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= min(1019, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -24,9 +24,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 2; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1023; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 384 * c1; c4 <= min(1023, 384 * c1 + 383); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 384 * c1; c4 <= min(1023, 384 * c1 + 383); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1023; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 7; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 128 * c2; c6 <= 128 * c2 + 127; c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 384 * c1; c7 <= min(1023, 384 * c1 + 383); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -25,9 +25,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1023; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1023; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= min(1023, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)