Index: polly/lib/Transform/MatmulOptimizer.cpp
===================================================================
--- polly/lib/Transform/MatmulOptimizer.cpp
+++ polly/lib/Transform/MatmulOptimizer.cpp
@@ -13,10 +13,13 @@
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Simplify.h"
+#include "polly/Support/GICHelper.h"
 #include "polly/Support/ISLTools.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
@@ -126,6 +129,18 @@
              "macro-kernel, by Nr, the parameter of the micro-kernel"),
     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<bool>
+    PMBasedTCOpts("polly-tc-opt",
+                  cl::desc("Perform optimizations of tensor contractions based "
+                           "on pattern matching"),
+                  cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<bool>
+    PMBasedMMMOpts("polly-matmul-opt",
+                   cl::desc("Perform optimizations of matrix multiplications "
+                            "based on pattern matching"),
+                   cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 namespace {
 /// Parameters of the micro kernel.
 ///
@@ -160,6 +175,42 @@
   int k = -1;
 };
 
+/// Parameters of the tensor contraction operands.
+///
+/// Parameters, which describe access relations that represent operands of the
+/// tensor contraction.
+struct TCInfoTy {
+  // Memory accesses that represent reading from tensors, which are operands of
+  // the tensor contraction.
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  // Memory accesses that represent reading from and writing into the tensor,
+  // which contains the result of the tensor contraction.
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  // Input dimensions of the schedule space, which represent free
+  // indices of tensors.
+  SmallDenseSet<int> I;
+  SmallDenseSet<int> J;
+  // Input dimensions of the schedule space, which represent contracted
+  // indices of tensors.
+  SmallDenseSet<int> P;
+  // Sizes of tensor dimensions for corresponding input dimensions of
+  // the schedule space. The size of the tensor dimension can be larger than
+  // the size of the corresponding input dimension of the schedule space.
+  // This does not correspond to a tensor contraction. However, such a pattern
+  // will be optimized by the transformation.
+  SmallVector<int> DimensionSizes;
+  SmallVector<int> ADimensions;
+  SmallVector<int> BDimensions;
+  SmallVector<int> CDimensions;
+  // Permutations of indices of I, J, and P, which describe operands of
+  // the tensor contraction and its result.
+  SmallVector<int> OrderedI;
+  SmallVector<int> OrderedJ;
+  SmallVector<int> OrderedP;
+};
+
 /// Create an isl::union_set, which describes the option of the form
 /// [isolate[] -> unroll[x]].
 ///
@@ -1024,14 +1075,579 @@
   return false;
 }
 
+/// Get the dimension size.
+///
+/// Return the size of the dimension @p Pos, which is obtained from @p SAI.
+/// Return -1 in the case of the first dimension of a multi-dimensional array,
+/// since the ScopArrayInfo class does not carry size information.
+///
+/// @param SAI The information about the array.
+/// @param Pos The position of the dimension.
+/// @return The size of the dimension.
+static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
+  if (Pos == 0)
+    return -1;
+  const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
+  assert(SCEVDimSize);
+  auto *ConstantDimSize = dyn_cast<const SCEVConstant>(SCEVDimSize);
+  assert(ConstantDimSize);
+  auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
+  assert(IntDimSize);
+  return IntDimSize->getSExtValue();
+}
+
+/// Check whether the access relation has the specified form.
+///
+/// Check that the access relation @p AccMap has the form T[I0, …, In], where
+/// indexes I0, …, In are specified by @p Dimensions.
+///
+/// @param Domain     The domain of the access relation.
+/// @param AccMap     The access relation to be checked.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+///         otherwise.
+static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap,
+                               ArrayRef<int> Dimensions) {
+  isl::space Space = AccMap.get_space();
+  if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size())
+    return false;
+  isl::map Universe = isl::map::universe(Space);
+
+  // Create an access relation of the following form:
+  // [I0, …, Im] -> [Il, …, In], where indexes
+  // Il, …, In are specified by @p Dimensions.
+  isl::map PossibleTensor = isl::manage(Universe.copy());
+  for (unsigned i = 0; i < Dimensions.size(); i++) {
+    const int InPos = Dimensions[i];
+    if (InPos == -1)
+      return false;
+    PossibleTensor =
+        PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i);
+  }
+
+  AccMap = AccMap.intersect_domain(Domain);
+  PossibleTensor = PossibleTensor.intersect_domain(Domain);
+
+  // If AccMap spans entire domain (Non-partial write),
+  // compute FirstPos and SecondPos.
+  // If AccMap != PossibleTensor here (the two maps have been gisted at
+  // this point), it means that the writes are not complete, or in other
+  // words, it is a Partial write and Partial writes must be rejected.
+  return AccMap.is_equal(PossibleTensor);
+}
+
+/// Check whether the access represents the tensor contraction operand.
+///
+/// Check that the access relation @p AccMap has the form T[i1, …, in].
+/// Obtained indexes i1, …, in, their sizes and their permutation are stored
+/// into @p  IndexSet, @p DimensionSizes, and @p Dimensions, respectively.
+///
+/// @param Domain         The domain of the access relation.
+/// @param AccMap         The access relation to be checked.
+/// @param IndexSet       The subset of the input dimensions.
+/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions.
+/// @param Dimensions     The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+///         otherwise.
+static bool isTCOperandAcc(isl::set Domain, isl::map AccMap,
+                           SmallDenseSet<int> &IndexSet,
+                           SmallVectorImpl<int> &DimensionSizes,
+                           SmallVectorImpl<int> &Dimensions) {
+  isl::id Id = AccMap.get_tuple_id(isl::dim::out);
+  const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
+  assert(SAI && "AccMap should represent memory access");
+
+  // Fix values of output dimensions with respect to their positions.
+  isl::map CheckMap = isl::manage(AccMap.copy());
+  unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out));
+  for (unsigned i = 0; i < OutDimNum; i++)
+    CheckMap = CheckMap.fix_si(isl::dim::out, i, i);
+
+  // In the case of the tensor contraction, values of output dimensions are
+  // fixed and form a permutation of a subset of values of input dimensions.
+  // Try to obtain the permutation and sizes of corresponding input dimensions.
+  Dimensions.assign(OutDimNum, -1);
+  for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) {
+    isl::val Val = isl::manage(
+        isl_map_plain_get_val_if_fixed(CheckMap.get(), isl_dim_in, i));
+    if (!Val.is_int())
+      continue;
+    int OutPos = -1;
+    llvm::APInt ValAPInt = APIntFromVal(Val);
+    if (ValAPInt.isSignedIntN(32))
+      OutPos = ValAPInt.getSExtValue();
+    assert((OutPos != -1) && (OutPos < static_cast<int>(OutDimNum)));
+    if (IndexSet.count(i))
+      return false;
+    IndexSet.insert(i);
+    Dimensions[OutPos] = i;
+    if (DimensionSizes[i] <= 0)
+      DimensionSizes[i] = getDimSize(SAI, OutPos);
+  }
+
+  return isCorrectAccessMap(Domain, AccMap, Dimensions);
+}
+
+/// Find the intersection of two sets.
+///
+/// Find the intersection of the set @p A and the set @p B.
+///
+/// @param A, B Sets to intersect.
+/// @return The set intersection.
+static SmallDenseSet<int> intersect(const SmallDenseSet<int> &A,
+                                    const SmallDenseSet<int> &B) {
+  SmallDenseSet<int> Intersection = A;
+  set_intersect(Intersection, B);
+  return Intersection;
+}
+
+/// Determine the access that writes to the tensor, which contains
+/// the result of the tensor contraction.
+///
+/// @param Domain        The domain of the statement.
+/// @param Stmt          The statement, which writes to memory.
+/// @param TCI           The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return The determined MemoryAccess, or nullptr if there is no necessary
+///         access within the SCoP.
+static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt,
+                                    TCInfoTy &TCI,
+                                    SmallDenseSet<int> &IandJIndexSet) {
+  TCI.WriteToC = nullptr;
+  SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+  for (MemoryAccess *MemA : reverse(Accesses)) {
+    if (!MemA->isLatestArrayKind())
+      continue;
+    // The last memory access should be a write memory access.
+    if (!MemA->isWrite())
+      return nullptr;
+
+    isl::map AccMap = MemA->getLatestAccessRelation();
+    if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes,
+                        TCI.CDimensions))
+      return nullptr;
+
+    return MemA;
+  }
+  return nullptr;
+}
+
+/// Determine an access, which reads elements of an operand of the tensor
+/// contraction
+///
+/// @param MemAccessPtr  The access, which reads elements of the tensor.
+/// @param IndexSet      The set, which contains indexes of the tensors.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @param Dimensions    The permutation of the subset of the input dimensions.
+/// @param TCI           The information about the tensor contraction.
+/// @return True if the memory access @p MemAccessPtr corresponds
+///         to the tensor contraction.
+static bool setReadAccess(MemoryAccess *MemAccessPtr,
+                          const SmallDenseSet<int> &IndexSet,
+                          const SmallDenseSet<int> &IandJIndexSet,
+                          SmallVector<int> Dimensions, TCInfoTy &TCI) {
+  if (!TCI.A) {
+    // Probably IndexSet is a union of I and P sets.
+    if (intersect(IndexSet, TCI.P).size() != TCI.P.size())
+      return false;
+
+    // Obtain the set I.
+    TCI.I = std::move(set_difference(IndexSet, TCI.P));
+    if (intersect(IandJIndexSet, TCI.I).size() != TCI.I.size())
+      return false;
+
+    // Obtain the set J.
+    TCI.J = std::move(set_difference(IandJIndexSet, TCI.I));
+
+    // Set the first operand of the tensor contraction.
+    TCI.A = MemAccessPtr;
+    TCI.ADimensions = std::move(Dimensions);
+    return true;
+  }
+
+  if (!TCI.B) {
+    // IndexSet should be a union of J and P sets.
+    if (!(intersect(IndexSet, TCI.J).size() == TCI.J.size()) &&
+        (intersect(IndexSet, TCI.P).size() == TCI.P.size()) &&
+        (IndexSet.size() == TCI.P.size() + TCI.J.size()))
+      return false;
+
+    // Set the second operand of the tensor contraction.
+    TCI.B = MemAccessPtr;
+    TCI.BDimensions = std::move(Dimensions);
+    return true;
+  }
+
+  return false;
+}
+
+/// Check that all memory accesses of the statement, except from the last
+/// one, are read memory accesses, which read elements of operands of the tensor
+/// contraction and its result.
+///
+/// @param Domain        The domain of the statement.
+/// @param Stmt          The statement, which writes to memory.
+/// @param TCI           The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return True if all read memory accesses of the statement @p Stmt correspond
+///         to the tensor contraction.
+static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI,
+                            SmallDenseSet<int> &IandJIndexSet) {
+  TCI.A = nullptr;
+  TCI.B = nullptr;
+  TCI.ReadFromC = nullptr;
+  SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) {
+    MemoryAccess *MemAccessPtr = *MemA;
+    if (!MemAccessPtr->isLatestArrayKind())
+      continue;
+
+    // All memory accesses, except from the last one, should be read memory
+    // accesses.
+    if (MemAccessPtr->isWrite())
+      return false;
+
+    // There is only one memory access, which reads elements of the result of
+    // the tensor contraction.
+    isl::map AccMap = MemAccessPtr->getLatestAccessRelation();
+    if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) {
+      if (TCI.ReadFromC)
+        return false;
+      TCI.ReadFromC = MemAccessPtr;
+      continue;
+    }
+
+    SmallVector<int> Dimensions;
+    SmallDenseSet<int> IndexSet;
+    if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes,
+                        Dimensions))
+      return false;
+
+    if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI))
+      return false;
+  }
+
+  // Check that there are read memory accesses, which read elements of operands
+  // of the tensor contraction and its result.
+  return TCI.ReadFromC && TCI.A && TCI.B;
+}
+
+/// Check accesses to operands of the tensor contraction.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, represent accesses
+/// to the non-scalar operands of the tensor contraction.
+///
+/// @param  Domain          The domain of the SCoP statement.
+/// @param  PartialSchedule The partial schedule of the SCoP statement.
+/// @param  TCI             Parameters of the tensor contraction operands.
+/// @return                 True if the corresponding SCoP statement
+///                         represents tensor contraction and false,
+///                         otherwise.
+static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule,
+                              TCInfoTy &TCI) {
+  isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
+  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+
+  // In region statements, the order of memory accesses execution is not
+  // predictable at compile-time.
+  if ((Stmt->size() <= 1) || Stmt->isRegionStmt())
+    return false;
+
+  unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in));
+  TCI.DimensionSizes.resize(DimNum);
+  SmallDenseSet<int> IandJIndexSet;
+
+  TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet);
+  if (!TCI.WriteToC)
+    return false;
+
+  if (intersect(IandJIndexSet, TCI.P).size() != 0)
+    return false;
+
+  if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet))
+    return false;
+
+  return true;
+}
+
+/// Check that dependency corresponds to the tensor contraction.
+///
+/// Check that the dependency has the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement. For this purpose, we analyze the set @p DepDelta, which
+/// represents the differences between image elements and domain elements of
+/// the corresponding map.
+///
+/// @param  DepDelta    The set contains the differences between image elements
+///                     and corresponding domain elements of the map, which
+///                     represents the dependency.
+/// @param  Pos         The position of the index ki.
+/// @param  BoundDeltas In the case of indexes of ki, the difference between
+///                     image elements and corresponding domain elements
+///                     corresponds to the difference between lexicographic
+///                     minimum and lexicographic maximum of the corresponding
+///                     dimension of the domain of the statement.
+/// @param  IndexSet    Obtained indexes ki, which describe the dependency.
+/// @return True if dependencies correspond to the tensor contraction
+///         and false, otherwise.
+static bool isTcDep(isl::set DepDelta, unsigned Pos, isl::set BoundDeltas,
+                    SmallDenseSet<int> *IndexSet) {
+  // Check the difference between the image element and the domain element
+  // in the case of the index ki.
+  if (!DepDelta.plain_get_val_if_fixed(isl::dim::set, Pos).is_one())
+    return false;
+
+  // Image elements and corresponding domain elements should be equal in
+  // the case of positions, which are lower than the specified position.
+  for (unsigned j = 0; j < Pos; j++)
+    if (!DepDelta.plain_get_val_if_fixed(isl::dim::set, j).is_zero())
+      return false;
+
+  // Compute a set, which is used to analyze how values of
+  // the domain are related to the map that describes the dependency.
+  isl::map DepDeltaNegToBoundDeltas = isl::map::from_domain_and_range(
+      isl::manage(isl_set_neg(DepDelta.copy())), BoundDeltas);
+  isl::set Complement = DepDeltaNegToBoundDeltas.deltas();
+
+  const unsigned DimNum = unsignedFromIslSize(DepDelta.dim(isl::dim::set));
+  for (unsigned j = Pos + 1; j < DimNum; j++) {
+    if (!IndexSet->count(j)) {
+      // Check the difference between the image element and the domain element
+      // in the case of indexes, which do not describe the dependency.
+      if (DepDelta.plain_get_val_if_fixed(isl::dim::set, j).is_zero())
+        continue;
+      return false;
+    }
+
+    // In the case of other indexes, which describe the dependency,
+    // the difference between the image element and the domain element
+    // should be equal to the difference between lexicographic minimum and
+    // lexicographic maximum of the domain of the statement.
+    if (!Complement.plain_get_val_if_fixed(isl::dim::set, j).is_zero())
+      return false;
+  }
+
+  return true;
+}
+
+/// Check that dependencies correspond to the tensor contraction.
+///
+/// Check that there are only true dependencies of the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement represented by @p Schedule. Such dependencies are produced by
+/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet.
+///
+/// @param  Schedule The schedule of the SCoP statement.
+/// @param  D        The SCoP dependencies.
+/// @param  Domain   The domain of the statement.
+/// @param  IndexSet Obtained indexes ki, which describe the dependencies.
+/// @return True if dependencies correspond to the tensor contraction
+///         and false, otherwise.
+static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D,
+                               SmallDenseSet<int> *IndexSet, isl::set Domain) {
+  isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
+  isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
+  if (!Red.is_null())
+    Dep = Dep.unite(Red);
+
+  isl::space DomainSpace = Schedule.get_space().domain();
+  isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace);
+  isl::set DepDeltas = Dep.extract_map(Space).deltas();
+  unsigned DeltasDimNum = unsignedFromIslSize(DepDeltas.dim(isl::dim::set));
+  isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff();
+  isl::pw_multi_aff BoundSub = Domain.lexmax_pw_multi_aff().sub(LowerBound);
+  auto BoundDeltas = isl::manage(isl_set_from_pw_multi_aff(BoundSub.release()));
+
+  for (int i = static_cast<int>(DeltasDimNum) - 1; i >= 0; i--) {
+    // In the case of the tensor contraction, the difference between image
+    // elements and domain elements lies on a hyperplane where a dimension
+    // has the fixed value one.
+    isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1);
+    if (Intersection.is_empty())
+      continue;
+
+    if (!isTcDep(Intersection, i, BoundDeltas, IndexSet))
+      return false;
+
+    IndexSet->insert(i);
+    DepDeltas = DepDeltas.subtract(Intersection);
+  }
+
+  // In the case of the tensor contraction, all dependencies should have
+  // the previously described form.
+  if ((DeltasDimNum == 0) || !DepDeltas.is_empty())
+    return false;
+
+  return true;
+}
+
+/// Order the dimensions of operands of the tensor contraction.
+///
+/// To improve the spatial locality of the data, we use a heuristic, which is
+/// introduced in [1]. The heuristic logically reorders the tensor dimensions
+/// and consists of the following steps:
+///
+/// 1. Sort the dimensions in I and J by increasing stride in the tensor C.
+/// 2. If tensor A contains the last dimension of C, swap A with B and I with J.
+/// 3. Sort the dimensions in P by increasing stride in the tensor A.
+///
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+///
+/// @param TCI The information about tensors.
+static void orderDimensions(TCInfoTy &TCI) {
+  // Sort the dimensions in I and J by increasing stride in the tensor C.
+  for (const int &Dimension : TCI.CDimensions) {
+    assert(!(TCI.I.count(Dimension) && TCI.J.count(Dimension)));
+
+    if (TCI.I.count(Dimension)) {
+      TCI.OrderedI.push_back(Dimension);
+      continue;
+    }
+
+    if (TCI.J.count(Dimension))
+      TCI.OrderedJ.push_back(Dimension);
+  }
+
+  // If tensor A contains the last dimension of C, swap A with B and I with J.
+  if (TCI.I.count(TCI.CDimensions.back())) {
+    MemoryAccess *Access = TCI.A;
+    TCI.A = TCI.B;
+    TCI.B = Access;
+    TCI.I.swap(TCI.J);
+    TCI.ADimensions.swap(TCI.BDimensions);
+    TCI.OrderedI.swap(TCI.OrderedJ);
+  }
+
+  // Sort the dimensions in P by increasing stride in the tensor A.
+  for (const int &Dimension : TCI.ADimensions)
+    if (TCI.P.count(Dimension))
+      TCI.OrderedP.push_back(Dimension);
+}
+
+/// Check if the SCoP statement could probably be optimized with analytical
+/// modeling.
+///
+/// containsTC tries to determine whether the following conditions
+/// are true:
+///
+/// 1. The last memory access modeling an array, MA1, represents writing to
+///    memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)),
+///    where S is the SCoP statement under consideration and shuffle(I, J)
+///    is a permutation of indexes of sets I and J.
+/// 2. There are only true dependencies of the form
+///    S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+///    S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+///    statement represented by @p Schedule and ki are indexes of the set P.
+/// 3. SCoP contains only three access relations, MA2, MA3, and MA4 that
+///    represent reading from memory and have the form
+///    S(..., I, ..., P, ...) -> M(shuffle(I, P)),
+///    S(..., P, ..., J, ...) -> M(shuffle(J, P)),
+///    S(...) -> M(shuffle(I, J)), respectively.
+///
+/// @param  PartialSchedule The PartialSchedule that contains a SCoP statement
+///                         to check.
+/// @param  D               The SCoP dependencies.
+/// @param  TCI             Parameters of the tensor contraction operands.
+/// @param  Domain          The domain of the statement.
+/// @return True if dependencies and memory accesses correspond to the tensor
+///              contraction and false, otherwise.
+static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D,
+                             TCInfoTy &TCI, isl::set Domain) {
+  if (!containsOnlyTcDeps(PartialSchedule, D, &TCI.P, Domain))
+    return false;
+
+  // TODO: handle cases of scalar multiplication if needed.
+  if (TCI.P.size() == 0)
+    return false;
+
+  if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI))
+    return false;
+
+  // TODO: handle cases of GEMV if needed.
+  if ((TCI.I.size() == 0) || (TCI.J.size() == 0))
+    return false;
+
+  orderDimensions(TCI);
+
+  return true;
+}
+
+/// Check if this node contains a partial schedule that could
+/// probably be optimized with analytical modeling.
+///
+/// isTCPattern tries to determine whether the following conditions
+/// are true:
+/// 1. the partial schedule contains only one statement.
+/// 2. there are exactly three input dimensions.
+/// 3. there are four memory accesses that represent accesses to tensor
+///    contraction operand.
+/// 4. all memory accesses of the statement except from the last one, are
+///    read memory accesses and the last one is a write memory access.
+/// 5. all subscripts of the last memory access of the statement do not
+///    contain the variable used in the inner loop.
+///
+/// If this is the case, we could use an approach that is similar to
+/// the one used to get close-to-peak performance of matrix multiplications.
+///
+/// @param Node The node to check.
+/// @param D    The SCoP dependencies.
+/// @param TCI  Parameters of the tensor contraction operands.
+static bool isTCPattern(isl::schedule_node Node, const Dependences *D,
+                        TCInfoTy &TCI) {
+  Node = Node.child(0);
+  auto LeafType = isl_schedule_node_get_type(Node.get());
+  isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map();
+  isl::union_set Domain = Node.domain();
+  Node = Node.parent();
+
+  // The innermost band node is expected.
+  if (LeafType != isl_schedule_node_leaf ||
+      isl_union_map_n_map(PartialSchedule.get()) != 1)
+    return false;
+
+  // The partial schedule should contain only one statement.
+  if (isl_union_set_n_set(Domain.get()) != 1)
+    return false;
+
+  auto HasFilterNode = false;
+  auto NodeType = isl_schedule_node_get_type(Node.get());
+
+  // Check that all predecessors of the node contain all band nodes
+  // for the statement, which represents the tensor contraction.
+  // Subsequently, such band nodes will be replaced by a single band node.
+  while (NodeType != isl_schedule_node_domain) {
+    if (HasFilterNode && (NodeType == isl_schedule_node_band))
+      return false;
+
+    if (NodeType == isl_schedule_node_filter)
+      HasFilterNode = true;
+
+    Node = Node.parent();
+    NodeType = isl_schedule_node_get_type(Node.get());
+  }
+
+  isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule);
+  if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain)))
+    return true;
+
+  return false;
+}
+
 } // namespace
 
 isl::schedule_node
 polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
                                 const llvm::TargetTransformInfo *TTI,
                                 const Dependences *D) {
+  TCInfoTy TCI;
+  if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
+    LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
   MatMulInfoTy MMI;
-  if (isMatrMultPattern(Node, D, MMI)) {
+  if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
     LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
     return optimizeMatMulPattern(Node, TTI, MMI);
   }
Index: polly/lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- polly/lib/Transform/ScheduleOptimizer.cpp
+++ polly/lib/Transform/ScheduleOptimizer.cpp
@@ -302,6 +302,16 @@
   /// @param Node The node to check.
   static bool isTileableBandNode(isl::schedule_node Node);
 
+  /// Check if this node is a band node we want to transform using pattern
+  /// matching.
+  ///
+  /// We look for innermost band nodes where individual dimensions are marked as
+  /// permutable. There is no restriction on the number of individual
+  /// dimensions.
+  ///
+  /// @param Node The node to check.
+  static bool isPMOptimizableBandNode(isl::schedule_node Node);
+
   /// Pre-vectorizes one scheduling dimension of a schedule band.
   ///
   /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
@@ -446,7 +456,10 @@
   return true;
 }
 
-bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+/// Check if this node is a permutable band node, which has only one child.
+///
+/// @param Node The node to check.
+static bool isPermutableOneTimeParentBandNode(isl::schedule_node Node) {
   if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
     return false;
 
@@ -456,6 +469,13 @@
   if (!isl_schedule_node_band_get_permutable(Node.get()))
     return false;
 
+  return true;
+}
+
+bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+  if (!isPermutableOneTimeParentBandNode(Node))
+    return false;
+
   auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
 
   if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u)
@@ -464,6 +484,13 @@
   return isSimpleInnermostBand(Node);
 }
 
+bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
+  if (!isPermutableOneTimeParentBandNode(Node))
+    return false;
+
+  return isSimpleInnermostBand(Node);
+}
+
 __isl_give isl::schedule_node
 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
   if (FirstLevelTiling) {
@@ -509,10 +536,8 @@
   assert(OAI && "Expecting optimization options");
 
   isl::schedule_node Node = isl::manage(NodeArg);
-  if (!isTileableBandNode(Node))
-    return Node.release();
 
-  if (OAI->PatternOpts) {
+  if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
     isl::schedule_node PatternOptimizedSchedule =
         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
     if (!PatternOptimizedSchedule.is_null()) {
@@ -521,6 +546,9 @@
     }
   }
 
+  if (!isTileableBandNode(Node))
+    return Node.release();
+
   if (OAI->Postopts)
     Node = applyTileBandOpt(Node);
 
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %loadPolly \
 ; RUN: -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts
 
@@ -10,6 +10,36 @@
 ; is not through the original memory access, but trough a PHI node that was
 ; delicmed. This test covers the polybench 2mm and 3mm cases.
 ;
+; This test case generates the following schedule, which contans filters:
+;
+; domain: "{ Stmt_for_body8[i0, i1, i2]  : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 and
+;                                          0 <= i2 <= 2199;
+;            Stmt_for_body3[i0, i1] :      0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799;
+;            Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 }"
+; child:
+;  sequence:
+;  - filter: "{ Stmt_for_body3[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body3_last[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+;    child:
+;      schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1, 0 ]
+;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
 
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that the pattern matching detects the tensor contraction pattern
+; after a full run of -polly-delicm. This test case generates the following
+; schedule, which contans two band nodes. Without DeLICM two statement are
+; generated.
+;
+; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and
+;                                            0 <= i2 <= 31 and 0 <= i3 <= 31 and
+;                                            0 <= i4 <= 31 and 0 <= i5 <= 31 }"
+; child:
+;   schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]"
+;   permutable: 1
+;   coincident: [ 1, 1, 1, 1, 0 ]
+;   child:
+;     schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]"
+;     permutable: 1
+;     child:
+;       leaf
+;
+;   for (i = 0; i < 32; i++)
+;     for (j = 0; j < 32; j++)
+;       for (k = 0; k < 32; ++k)
+;         for (l = 0; l < 32; ++l)
+;           for (w = 0; w < 32; ++w)
+;             for (q = 0; q < 32; ++q)
+;               C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ]
+  %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7
+  %.pre = load double, double* %arrayidx37, align 8
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ]
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv
+  %i1 = load double, double* %arrayidx21, align 8
+  %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13
+  %i2 = load double, double* %arrayidx29, align 8
+  %mul = fmul fast double %i2, %i1
+  %add = fadd fast double %i, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 32
+  br i1 %exitcond.not, label %for.inc38, label %for.body15
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %exitcond9.not = icmp eq i64 %indvars.iv.next8, 32
+  br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %exitcond12.not = icmp eq i64 %indvars.iv.next11, 32
+  br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+  %exitcond15.not = icmp eq i64 %indvars.iv.next14, 32
+  br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  %exitcond18.not = icmp eq i64 %indvars.iv.next17, 32
+  br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21.not = icmp eq i64 %indvars.iv.next20, 32
+  br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,5 +1,5 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
-; RUN: -debug < %s 2>&1| FileCheck %s
+; RUN: -debug -polly-tc-opt < %s 2>&1| FileCheck %s
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1| FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-ast -analyze < %s | FileCheck %s --check-prefix=PARALLEL-AST
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1| FileCheck %s --check-prefix=STATS -match-full-lines
@@ -15,6 +15,7 @@
 ;        }
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
 ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ; PARALLEL-AST: #pragma known-parallel
 ; PARALLEL-AST: #pragma known-parallel
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -8,7 +8,8 @@
 ; RUN: -polly-target-1st-cache-level-size=32768 \
 ; RUN: -polly-target-vector-register-bitwidth=256 \
 ; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -debug \
+; RUN: -polly-tc-opt=true < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts
 ;
@@ -16,6 +17,7 @@
 ; in case scalar memory accesses were replaced by accesses to newly created
 ; arrays.
 ;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,5 +1,6 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: -debug-only=polly-opt-isl -disable-output \
+; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ;    for (i = 0; i < _PB_NI; i++)
@@ -14,6 +15,7 @@
 ;      }
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 1024; i++)
+;        for (j = 0; j < 1024; j++)
+;          for (l = 0; l < 64; ++l)
+;            for (w = 0; w < 64; ++w)
+;              C[i][j] += A[i][l][w] * B[w][j][l];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc30, %entry
+  %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc27, %for.cond1.preheader
+  %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc24, %for.cond4.preheader
+  %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+  %i = load double, double* %arrayidx13, align 8
+  %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
+  %i1 = load double, double* %arrayidx19, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
+  %i2 = load double, double* %arrayidx23, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx23, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24:                                        ; preds = %for.body9
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond39 = icmp ne i64 %indvars.iv.next38, 64
+  br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+  br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
+  br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32:                                        ; preds = %for.inc30
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 32; i++)
+;        for (j = 0; j < 1024; j++)
+;          for (k = 0; k < 32; ++k)
+;            for (l = 0; l < 1024; ++l)
+;              C[i][j][k] += A[i][k][l] * B[l][j];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc30, %entry
+  %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc27, %for.cond1.preheader
+  %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc24, %for.cond4.preheader
+  %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+  %i = load double, double* %arrayidx13, align 8
+  %arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40
+  %i1 = load double, double* %arrayidx17, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37
+  %i2 = load double, double* %arrayidx23, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx23, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24:                                        ; preds = %for.body9
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond39 = icmp ne i64 %indvars.iv.next38, 32
+  br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+  br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next44, 32
+  br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32:                                        ; preds = %for.inc30
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 32; i++)
+;        for (j = 0; j < 32; j++)
+;          for (k = 0; k < 32; ++k)
+;            for (l = 0; l < 32; ++l)
+;              for (w = 0; w < 32; ++w)
+;                for (q = 0; q < 32; ++q)
+;                  C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv
+  %i = load double, double* %arrayidx21, align 8
+  %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65
+  %i1 = load double, double* %arrayidx29, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59
+  %i2 = load double, double* %arrayidx37, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+  %exitcond61 = icmp ne i64 %indvars.iv.next60, 32
+  br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next63, 32
+  br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+  %exitcond67 = icmp ne i64 %indvars.iv.next66, 32
+  br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+  %exitcond70 = icmp ne i64 %indvars.iv.next69, 32
+  br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+  %exitcond73 = icmp ne i64 %indvars.iv.next72, 32
+  br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s
+; REQUIRES: asserts
+;
+;   for (i = 0; i < 8; i++)
+;     for (j = 0; j < 8; j++)
+;       for (k = 0; k < 4; ++k)
+;         for (l = 0; l < 1024; ++l)
+;           for (w = 0; w < 1024; ++w)
+;             for (q = 0; q < 4; ++q)
+;               C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71
+  %i = load double, double* %arrayidx23, align 8
+  %arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59
+  %i1 = load double, double* %arrayidx27, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv
+  %i2 = load double, double* %arrayidx37, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+  %exitcond61 = icmp ne i64 %indvars.iv.next60, 1024
+  br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next63, 1024
+  br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+  %exitcond67 = icmp ne i64 %indvars.iv.next66, 4
+  br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+  %exitcond70 = icmp ne i64 %indvars.iv.next69, 8
+  br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+  %exitcond73 = icmp ne i64 %indvars.iv.next72, 8
+  br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ;    /* C := alpha*A*B + beta*C */
@@ -15,6 +16,7 @@
 ; after the interchanging of loops.
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -0,0 +1,94 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s
+; REQUIRES: asserts
+;
+;   for (i = 0; i < 16; i++)
+;     for (j = 0; j < 16; j++)
+;       for (k = 0; k < 8; ++k)
+;         for (l = 0; l < 1024; ++l)
+;           for (w = 0; w < 8; ++w)
+;             for (q = 0; q < 8; ++q)
+;               for (x = 0; x < 8; ++x)
+;                 C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc60, %entry
+  %indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc57, %for.cond1.preheader
+  %indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc54, %for.cond4.preheader
+  %indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc51, %for.cond7.preheader
+  %indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc48, %for.cond10.preheader
+  %indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ]
+  br label %for.cond16.preheader
+
+for.cond16.preheader:                             ; preds = %for.inc45, %for.cond13.preheader
+  %indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ]
+  br label %for.body18
+
+for.body18:                                       ; preds = %for.body18, %for.cond16.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ]
+  %arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79
+  %i = load double, double* %arrayidx24, align 8
+  %arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85
+  %i1 = load double, double* %arrayidx32, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv
+  %i2 = load double, double* %arrayidx44, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx44, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.body18, label %for.inc45
+
+for.inc45:                                        ; preds = %for.body18
+  %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1
+  %exitcond72 = icmp ne i64 %indvars.iv.next71, 8
+  br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48
+
+for.inc48:                                        ; preds = %for.inc45
+  %indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1
+  %exitcond75 = icmp ne i64 %indvars.iv.next74, 8
+  br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51
+
+for.inc51:                                        ; preds = %for.inc48
+  %indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1
+  %exitcond78 = icmp ne i64 %indvars.iv.next77, 1024
+  br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54
+
+for.inc54:                                        ; preds = %for.inc51
+  %indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1
+  %exitcond81 = icmp ne i64 %indvars.iv.next80, 8
+  br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57
+
+for.inc57:                                        ; preds = %for.inc54
+  %indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1
+  %exitcond84 = icmp ne i64 %indvars.iv.next83, 16
+  br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60
+
+for.inc60:                                        ; preds = %for.inc57
+  %indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1
+  %exitcond87 = icmp ne i64 %indvars.iv.next86, 16
+  br i1 %exitcond87, label %for.cond1.preheader, label %for.end62
+
+for.end62:                                        ; preds = %for.inc60
+  ret void
+}
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,12 +1,13 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug < %s 2>&1| FileCheck %s
+; RUN: -debug -polly-tc-opt=true < %s 2>&1| FileCheck %s
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-target-throughput-vector-fma=1 \
 ; RUN: -polly-target-latency-vector-fma=8 \
 ; RUN: -polly-target-1st-cache-level-size=32768 \
 ; RUN: -polly-target-vector-register-bitwidth=256 \
 ; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \
-; RUN: -analyze < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: -analyze -polly-tc-opt=true < %s |  \
+; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ;    C := A * B + C
@@ -20,6 +21,7 @@
 ;        for (j = 0; j < _PB_NJ; j++)
 ;	   C[i][j] += A[i][k] * B[k][j];
 ;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
 ; PATTERN-MATCHING-OPTS:    // 1st level tiling - Tiles