Index: polly/lib/Transform/MatmulOptimizer.cpp
===================================================================
--- polly/lib/Transform/MatmulOptimizer.cpp
+++ polly/lib/Transform/MatmulOptimizer.cpp
@@ -796,6 +796,46 @@
   return Node.graft_before(NewNode);
 }
 
+/// Create the copy statement and redirect access
+///
+/// Create a new statement, which copies data from the memory access location
+/// specified by the access relation @p AccRel to a location
+/// specified by @p AccRelPacked. The caller is expected to subsequently
+/// redirect the memory access relation @p AccRel of @p Stmt to
+/// @p AccRelPacked.
+///
+/// @param Stmt           The statement with an access @p AccRel
+///                       to be redirected.
+/// @param AccRel         The original access relation of @p Stmt.
+/// @param AccRelPacked   A new access relation.
+/// @param OuterDomainMap A map from the scatter space to the outermost scatter
+///                       dimensions that become the leading dimensions of the
+///                       copy statement domain.
+/// @param MapOldIndVar   The relation that maps the original induction
+///                       variables to the ones produced by schedule
+///                       transformations.
+/// @return A new copy statement.
+static ScopStmt *addScopStmt(ScopStmt *Stmt, isl::map AccRel,
+                             isl::map AccRelPacked, isl::map OuterDomainMap,
+                             isl::map MapOldIndVar) {
+  // { Memref[] -> Packed[] }
+  isl::map PackedTranslator = AccRelPacked.apply_domain(AccRel);
+  isl::set Domain = Stmt->getDomain();
+  // Prevent the unbound optimum
+  isl::map CopyFrom = MapOldIndVar.intersect_domain(Domain);
+  // { Scatter[] -> Memref[] }
+  CopyFrom = CopyFrom.reverse().apply_range(AccRel);
+  // { Scatter[] -> CopyStmt[] }
+  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
+  // { CopyStmt[] }
+  isl::set CopyDomain = DomainTranslator.range();
+
+  // Translate the access relations to the new domain.
+  // { CopyStmt[] -> Memref[] }
+  CopyFrom = CopyFrom.apply_domain(DomainTranslator);
+  // { CopyStmt[] -> Packed[] }
+  isl::map CopyTo = CopyFrom.apply_range(PackedTranslator);
+
+  return Stmt->getParent()->addScopStmt(CopyFrom, CopyTo, CopyDomain);
+}
+
 static isl::schedule_node optimizePackedB(isl::schedule_node Node,
                                           ScopStmt *Stmt, isl::map MapOldIndVar,
                                           MicroKernelParamsTy MicroParams,
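The helper above is pure isl map algebra, which is hard to follow from the relation comments alone. The following standalone sketch (not part of the patch) replays the same sequence of operations on a hypothetical two-dimensional statement using the isl C++ bindings; every set and map string, including the Outer[] tuple that stands in for the outermost scatter dimensions, is an assumption made only for illustration.

  // Illustration only: the map algebra of addScopStmt on a toy SCoP.
  #include <isl/cpp.h>
  #include <cstdio>
  #include <cstdlib>

  int main() {
    isl::ctx Ctx(isl_ctx_alloc());

    // Hypothetical inputs: a statement accessing Memref[], its packed
    // counterpart Packed[], a schedule into Scatter[], and a projection onto
    // the outermost scatter dimension.
    isl::map AccRel(Ctx, "{ Stmt[i, j] -> Memref[i, j] }");
    isl::map AccRelPacked(Ctx, "{ Stmt[i, j] -> Packed[j, i] }");
    isl::map MapOldIndVar(Ctx, "{ Stmt[i, j] -> Scatter[i, j] }");
    isl::set Domain(Ctx, "{ Stmt[i, j] : 0 <= i < 4 and 0 <= j < 4 }");
    isl::map OuterDomainMap(Ctx, "{ Scatter[i, j] -> Outer[i] }");

    // { Memref[] -> Packed[] }
    isl::map PackedTranslator = AccRelPacked.apply_domain(AccRel);
    // { Scatter[] -> Memref[] }, restricted to the statement domain.
    isl::map CopyFrom =
        MapOldIndVar.intersect_domain(Domain).reverse().apply_range(AccRel);
    // { Scatter[] -> CopyStmt[] }: pair the outer dimensions with the
    // accessed array element.
    isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
    // { CopyStmt[] }: one copy-statement instance per array element.
    isl::set CopyDomain = DomainTranslator.range();
    // { CopyStmt[] -> Memref[] } and { CopyStmt[] -> Packed[] }.
    CopyFrom = CopyFrom.apply_domain(DomainTranslator);
    isl::map CopyTo = CopyFrom.apply_range(PackedTranslator);

    auto Dump = [](const char *Name, char *Str) {
      std::printf("%s: %s\n", Name, Str);
      std::free(Str);
    };
    Dump("CopyDomain", isl_set_to_str(CopyDomain.get()));
    Dump("CopyFrom", isl_map_to_str(CopyFrom.get()));
    Dump("CopyTo", isl_map_to_str(CopyTo.get()));
    return 0;
  }

CopyDomain comes out as the wrapped product of Outer[] and Memref[], i.e. the "outer scatter dimensions plus one instance per array element" domain that the patch constructs for its copy statements.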
@@ -814,23 +854,29 @@
 
   // Compute the access relation for copying from B to PackedB.
   isl::map AccRelB = MMI.B->getLatestAccessRelation();
+  unsigned AccRelBDimOutNum = unsignedFromIslSize(AccRelB.dim(isl::dim::out));
   isl::map AccRelPackedB = getMatMulAccRel(MapOldIndVar, 3, 7);
   AccRelPackedB = AccRelPackedB.set_tuple_id(isl::dim::out,
                                              PackedB->getBasePtrId());
 
-  // Create the copy statement and redirect access.
-  ScopStmt *CopyStmt = S->addScopStmt(AccRelB, AccRelPackedB, Domain);
+  // Compute the domain for the copy statement.
+  // Construct the copy statement domain out of the 2 outermost scatter
+  // dimensions (to match the 2 band nodes surrounding the extension node) and
+  // the array elements to copy (one statement instance per array element).
+  // { Scatter[] }
+  isl::set ScatterDomain = MapOldIndVar.intersect_domain(Domain).range();
+  isl::map OuterDomainMap =
+      makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 2, 7);
+
+  ScopStmt *CopyStmt =
+      addScopStmt(Stmt, AccRelB, AccRelPackedB, OuterDomainMap, MapOldIndVar);
   MMI.B->setNewAccessRelation(AccRelPackedB);
 
-  unsigned Dim = unsignedFromIslSize(MapOldIndVar.range_tuple_dim());
-  assert(Dim >= 2);
-  // Insert into the schedule tree.
-  isl::map ExtMap = MapOldIndVar.project_out(isl::dim::out, 2, Dim - 2);
-  ExtMap = ExtMap.reverse();
-  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
-  ExtMap = ExtMap.intersect_range(Domain);
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, CopyStmt->getDomainId());
-  return createExtensionNode(Node, ExtMap);
+  // { Scatter[] -> CopyStmt[] }
+  isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
+  ExtScatterCopy =
+      ExtScatterCopy.project_out(isl::dim::in, 2, AccRelBDimOutNum);
+  return createExtensionNode(Node, ExtScatterCopy);
 }
 
 static isl::schedule_node optimizePackedA(isl::schedule_node Node, ScopStmt *,
@@ -853,11 +899,10 @@
 
   // Compute the access relation for copying from A to PackedA.
   isl::map AccRelA = MMI.A->getLatestAccessRelation();
+  unsigned AccRelADimOutNum = unsignedFromIslSize(AccRelA.dim(isl::dim::out));
   isl::map AccRelPackedA = getMatMulAccRel(MapOldIndVar, 4, 6);
   AccRelPackedA = AccRelPackedA.set_tuple_id(isl::dim::out,
                                              PackedA->getBasePtrId());
-  // { MemrefA[] -> PackedA[] }
-  isl::map PackedATranslator = AccRelPackedA.apply_domain(AccRelA);
 
   // Compute the domain for the copy statement.
   // Construct the copy statement domain out of the 3 outermost scatter
@@ -868,28 +913,17 @@
   // { Scatter[] -> OutermostScatter[] }
   isl::map OuterDomainMap =
       makeIdentityMap(ScatterDomain, true).project_out(isl::dim::out, 3, 6);
-  // { Scatter[] -> MemrefA[] }
-  isl::map CopyFrom = MapOldIndVar.reverse().apply_range(AccRelA);
-  // { Scatter[] -> CopyStmt[] }
-  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
-  // { CopyStmt[] }
-  isl::set CopyDomain = DomainTranslator.range();
-
-  // Translate the access relations to the new domain.
-  // { CopyStmt[] -> MemrefA[] }
-  CopyFrom = CopyFrom.apply_domain(DomainTranslator);
-  // { CopyStmt[] -> PackedA[] }
-  isl::map CopyTo = CopyFrom.apply_range(PackedATranslator);
 
   // Create the copy statement and redirect access.
   ScopStmt *CopyStmt =
-      Stmt->getParent()->addScopStmt(CopyFrom, CopyTo, CopyDomain);
+      addScopStmt(Stmt, AccRelA, AccRelPackedA, OuterDomainMap, MapOldIndVar);
   MMI.A->setNewAccessRelation(AccRelPackedA);
 
   // Insert into the schedule tree.
   // { Scatter[] -> CopyStmt[] }
   isl::map ExtScatterCopy = makeIdentityMap(CopyStmt->getDomain(), true);
-  ExtScatterCopy = ExtScatterCopy.project_out(isl::dim::in, 3, 2);
+  ExtScatterCopy =
+      ExtScatterCopy.project_out(isl::dim::in, 3, AccRelADimOutNum);
   return createExtensionNode(Node, ExtScatterCopy);
 }
 
@@ -1818,6 +1852,253 @@
   return false;
 }
 
+/// Order the dimensions of operands of the tensor contraction.
+///
+/// To improve the spatial locality of the data, we use a heuristic, which is
+/// introduced in [1]. The heuristic logically reorders the tensor dimensions
+/// and consists of the following steps:
+///
+/// 1. Sort the dimensions in bundles I and J by increasing stride in
+///    the tensor C.
+/// 2. If tensor A contains the last dimension of C, swap A with B and I with J.
+/// 3. Sort the dimensions in the bundle P by increasing stride in the tensor A.
+///
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+///
+/// @param TCI The information about the tensor contraction.
+static void orderDimensions(TCInfoTy &TCI) {
+  // Sort the dimensions in bundles I and J by increasing stride in
+  // the tensor C
+  for (const int &Dimension : TCI.CDimensions) {
+    assert(TCI.I.count(Dimension) || TCI.J.count(Dimension));
+    if (TCI.I.count(Dimension)) {
+      TCI.OrderedI.push_back(Dimension);
+      continue;
+    }
+    if (TCI.J.count(Dimension))
+      TCI.OrderedJ.push_back(Dimension);
+  }
+
+  // If tensor A contains the last dimension of C, swap A with B and I with J
+  if (TCI.I.count(TCI.CDimensions.back())) {
+    MemoryAccess *Access = TCI.A;
+    TCI.A = TCI.B;
+    TCI.B = Access;
+    TCI.I.swap(TCI.J);
+    TCI.ADimensions.swap(TCI.BDimensions);
+    TCI.OrderedI.swap(TCI.OrderedJ);
+  }
+
+  // Sort the dimensions in the bundle P by increasing stride in the tensor A
+  for (const int &Dimension : TCI.ADimensions)
+    if (TCI.P.count(Dimension))
+      TCI.OrderedP.push_back(Dimension);
+}
+
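As a concrete illustration of the ordering heuristic above, here is a standalone mock-up (not part of the patch) of the three steps applied to a hypothetical contraction C[i1][i2][j] += A[i1][i2][p] * B[p][j]. Plain standard containers stand in for the TCInfoTy fields, and the dimension names and bundles are assumptions made only for this example.

  #include <cstdio>
  #include <set>
  #include <utility>
  #include <vector>

  int main() {
    enum { i1, i2, j, p };
    std::set<int> I = {i1, i2}, J = {j}, P = {p};
    std::vector<int> CDims = {i1, i2, j}, ADims = {i1, i2, p}, BDims = {p, j};
    std::vector<int> OrderedI, OrderedJ, OrderedP;

    // Step 1: take the I- and J-dimensions in the order of their appearance
    // in C (the patch's proxy for increasing stride in C).
    for (int Dim : CDims) {
      if (I.count(Dim))
        OrderedI.push_back(Dim);
      else if (J.count(Dim))
        OrderedJ.push_back(Dim);
    }

    // Step 2: if A contains the last dimension of C, swap the roles of A and
    // B (and of I and J).
    if (I.count(CDims.back())) {
      std::swap(ADims, BDims);
      std::swap(I, J);
      std::swap(OrderedI, OrderedJ);
    }

    // Step 3: take the P-dimensions in the order of their appearance in A.
    for (int Dim : ADims)
      if (P.count(Dim))
        OrderedP.push_back(Dim);

    // For this contraction: OrderedI = {i1, i2}, OrderedJ = {j},
    // OrderedP = {p}; no swap happens because the last dimension of C (j)
    // belongs to the J bundle.
    std::printf("|I| = %zu, |J| = %zu, |P| = %zu\n", OrderedI.size(),
                OrderedJ.size(), OrderedP.size());
    return 0;
  }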
+/// Logically map dimensions of a tensor to dimensions of an imaginary matrix
+///
+/// flattenDimensions returns a new dimension, which is a linear combination of
+/// @p Dimensions in the order specified by @p DimensionsOrder. Specifically,
+/// a new dimension J can be represented as
+/// J = Dimensions[D0] * A0 + … + Dimensions[D(N - 1)] * A(N - 1),
+/// where Di = DimensionsOrder[i],
+/// Ai = DimensionsSizes[D(i + 1)] * … * DimensionsSizes[D(N - 1)].
+///
+/// Using the described one-to-one mapping, which is introduced in [1], the
+/// optimization of the matrix multiplication of the corresponding imaginary
+/// matrices can be performed for the tensor contraction.
+///
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+///
+/// @param Dimensions      Dimensions of the tensor.
+/// @param DimensionsOrder An order of dimensions in the linear combination.
+/// @param DimensionsSizes Sizes of dimensions in the linear combination.
+/// @return A new dimension.
+static isl::union_pw_aff
+flattenDimensions(isl::multi_union_pw_aff Dimensions,
+                  const SmallVector<int> &DimensionsOrder,
+                  const SmallVector<int> &DimensionsSizes) {
+  isl::union_pw_aff NewDimension;
+  isl::ctx Ctx = Dimensions.ctx();
+  for (const int &Dim : DimensionsOrder) {
+    // Handle the first element in the linear combination of dimensions
+    if (NewDimension.is_null()) {
+      NewDimension = isl::manage(
+          isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim));
+      continue;
+    }
+
+    // Update the linear combination
+    int DimSize = DimensionsSizes[Dim];
+    if (DimSize > 0)
+      NewDimension = isl::manage(isl_union_pw_aff_scale_val(
+          NewDimension.release(), isl::val(Ctx, DimSize).release()));
+    else
+      NewDimension = isl::manage(isl_union_pw_aff_scale_val(
+          NewDimension.release(), isl::val(Ctx, 1).release()));
+
+    // Add a new element to the linear combination
+    NewDimension = NewDimension.add(isl::manage(
+        isl_multi_union_pw_aff_get_union_pw_aff(Dimensions.get(), Dim)));
+  }
+  return NewDimension;
+}
+
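The linearization above is easier to see on plain integers. The sketch below (not part of the patch) performs the same scale-then-add loop on a flat index instead of on isl union_pw_affs; the dimension sizes, the order, and the sample point are assumptions made only for this example.

  #include <cassert>
  #include <vector>

  // Combine the indices selected by Order into one flat index, scaling the
  // running value by the size of the current dimension before adding it,
  // exactly as the loop in flattenDimensions does.
  static long flatten(const std::vector<long> &Indices,
                      const std::vector<int> &Order,
                      const std::vector<int> &Sizes) {
    long Flat = 0;
    bool First = true;
    for (int Dim : Order) {
      if (!First)
        Flat *= Sizes[Dim] > 0 ? Sizes[Dim] : 1;
      Flat += Indices[Dim];
      First = false;
    }
    return Flat;
  }

  int main() {
    // Three tensor dimensions of sizes 4, 5, and 6; dimensions 0 and 2 are
    // flattened (in that order) into a single imaginary matrix dimension.
    std::vector<int> Sizes = {4, 5, 6};
    std::vector<int> Order = {0, 2};
    // The point (i0, i1, i2) = (3, 1, 2) maps to 3 * 6 + 2 = 20, i.e. the
    // coefficient of each index is the product of the sizes of the
    // dimensions that follow it in the chosen order.
    assert(flatten({3, 1, 2}, Order, Sizes) == 20);
    return 0;
  }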
+/// Logically represent tensor contraction operands as matrix multiplication
+/// operands.
+///
+/// Convert the information about tensor contraction operands into the
+/// information about matrix multiplication operands.
+///
+/// @param TCI The information about the tensor contraction.
+/// @return The information about matrix multiplication operands.
+static MatMulInfoTy convertIntoMatMulInfo(const TCInfoTy &TCI) {
+  MatMulInfoTy NewMatMulInfo;
+  NewMatMulInfo.A = TCI.A;
+  NewMatMulInfo.B = TCI.B;
+  NewMatMulInfo.ReadFromC = TCI.ReadFromC;
+  NewMatMulInfo.WriteToC = TCI.WriteToC;
+  return NewMatMulInfo;
+}
+
+/// Logically cast the schedule tree, which represents the tensor contraction,
+/// into a schedule tree, which represents the matrix multiplication.
+///
+/// convertIntoMMMSchedule maps dimensions of tensors to dimensions of
+/// imaginary matrices, which are operands of the matrix multiplication. Using
+/// the described one-to-one mapping, the optimization of the matrix
+/// multiplication of the corresponding imaginary matrices can be performed
+/// for the tensor contraction.
+///
+/// @param Node The schedule node to be optimized.
+/// @param TTI  Target Transform Info.
+/// @param TCI  The information about the tensor contraction.
+/// @return The optimized schedule node.
+static isl::schedule_node
+convertIntoMMMSchedule(isl::schedule_node Node,
+                       const llvm::TargetTransformInfo *TTI,
+                       const TCInfoTy &TCI) {
+  assert(TTI && "The target transform info should be provided.");
+
+  // Obtain dimensions of tensors
+  isl::union_set Domain = Node.get_universe_domain();
+  isl::union_pw_multi_aff IdentityUnionPwAff =
+      Domain.identity_union_pw_multi_aff();
+  auto Dimensions = isl::multi_union_pw_aff(IdentityUnionPwAff);
+  Dimensions = Dimensions.reset_tuple_id(isl::dim::set);
+
+  // Map dimensions of tensors to dimensions of imaginary matrices
+  isl::union_pw_aff IDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedI, TCI.DimensionSizes);
+  isl::union_pw_aff JDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedJ, TCI.DimensionSizes);
+  isl::union_pw_aff PDimensions =
+      flattenDimensions(Dimensions, TCI.OrderedP, TCI.DimensionSizes);
+
+  unsigned DimNumber = unsignedFromIslSize(Dimensions.dim(isl::dim::out));
+  Dimensions = isl::manage(isl_multi_union_pw_aff_drop_dims(
+      Dimensions.release(), isl_dim_out, 3, DimNumber - 3));
+  Dimensions = Dimensions.set_union_pw_aff(0, IDimensions);
+  Dimensions = Dimensions.set_union_pw_aff(1, JDimensions);
+  Dimensions = Dimensions.set_union_pw_aff(2, PDimensions);
+
+  // Modify the schedule tree
+  auto NodeType = isl_schedule_node_get_type(Node.get());
+  while ((NodeType != isl_schedule_node_domain) &&
+         (NodeType != isl_schedule_node_filter)) {
+    assert((NodeType != isl_schedule_node_sequence) &&
+           "Prevent the undefined behavior");
+    Node = Node.parent();
+    NodeType = isl_schedule_node_get_type(Node.get());
+  }
+  Node = Node.child(0);
+  Node = isl::manage(isl_schedule_node_cut(Node.release()));
+  return Node.insert_partial_schedule(Dimensions);
+}
+
+/// Apply the BLIS matmul optimization pattern if possible.
+///
+/// Map dimensions of tensors to dimensions of imaginary matrices,
+/// which are operands of the matrix multiplication. Using the described
+/// one-to-one mapping, which is introduced in [1], the optimization of
+/// the matrix multiplication of the corresponding imaginary matrices
+/// can be performed for the tensor contraction.
+///
+/// Specifically, make the loops containing the matrix multiplication the
+/// innermost loops and apply the BLIS matmul optimization pattern. BLIS
+/// implements matrix multiplication as three nested loops around
+/// a macro-kernel, plus two packing routines. The macro-kernel is implemented
+/// in terms of two additional loops around a micro-kernel.
+/// The micro-kernel is a loop around a rank-1 (i.e., outer product) update.
+///
+/// For a detailed description please see [2].
+///
+/// The order of the loops defines the data reused in the BLIS implementation
+/// of matrix multiplication ([2]). In particular, elements of the matrix B,
+/// the second operand of matrix multiplication, are reused between
+/// iterations of the innermost loop. To keep the reused data in cache, only
+/// elements of matrix A, the first operand of matrix multiplication,
+/// should be evicted during an iteration of the innermost loop. To provide
+/// such a cache replacement policy, elements of the matrix A can,
+/// in particular, be loaded first and, consequently, be least-recently-used.
+///
+/// In our case matrices are stored in row-major order instead of the
+/// column-major order used in the BLIS implementation ([2]). This affects
+/// only the form of the BLIS micro-kernel and the computation of its
+/// parameters. In particular, reused elements of the matrix B are
+/// successively multiplied by specific elements of the matrix A.
+///
+/// Refs.:
+/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS.
+/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X.
+/// [2] - Analytical Modeling is Enough for High Performance BLIS
+/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+/// Technical Report, 2014
+/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMicroKernelParams
+/// @see getMacroKernelParams
+///
+/// @param Node The node that contains a band to be optimized. The node
+///             is required to successfully pass
+///             ScheduleTreeOptimizer::isTCPattern.
+/// @param TTI  Target Transform Info.
+/// @param TCI  The information about the tensor contraction.
+///
+/// @returns The transformed schedule or nullptr if the optimization
+///          cannot be applied.
+static isl::schedule_node
+optimizeTCPattern(isl::schedule_node Node, const llvm::TargetTransformInfo *TTI,
+                  TCInfoTy &TCI) {
+  orderDimensions(TCI);
+
+  assert(TTI && "The target transform info should be provided.");
+  Node = convertIntoMMMSchedule(Node, TTI, TCI);
+
+  MatMulInfoTy MMI = convertIntoMatMulInfo(TCI);
+  MicroKernelParamsTy MicroKernelParams = getMicroKernelParams(TTI, MMI);
+  MacroKernelParamsTy MacroKernelParams =
+      getMacroKernelParams(TTI, MicroKernelParams, MMI);
+  Node = createMacroKernel(Node, MacroKernelParams);
+  Node = createMicroKernel(Node, MicroKernelParams);
+  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
+      MacroKernelParams.Kc == 1)
+    return Node;
+  isl::map MapOldIndVar = getInductionVariablesSubstitution(
+      Node, MicroKernelParams, MacroKernelParams);
+  if (MapOldIndVar.is_null())
+    return Node;
+  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
+  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
+  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
+                                          MacroKernelParams, MMI);
+}
+
 } // namespace
 
 isl::schedule_node
@@ -1825,8 +2106,10 @@
     const llvm::TargetTransformInfo *TTI, const Dependences *D) {
   TCInfoTy TCI;
-  if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
+  if (PMBasedTCOpts && isTCPattern(Node, D, TCI)) {
     LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
+    return optimizeTCPattern(Node, TTI, TCI);
+  }
 
   MatMulInfoTy MMI;
   if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
     LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
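The loop structure that the comment above describes is easier to picture in plain code. The sketch below (not part of the patch) shows a schematic C += A * B with BLIS-style tiling: three loops around a macro-kernel with two packing steps, a macro-kernel of two loops around a micro-kernel, and a micro-kernel that is a loop around rank-1 updates. The tile sizes and the row-major layout are assumptions for the example; the packing comments merely indicate where the copy statements (CopyStmt_0 and CopyStmt_1 in the tests below) are inserted, and edge handling is simplified.

  #include <algorithm>
  #include <vector>

  static void gemmBlisSketch(int M, int N, int K, const std::vector<double> &A,
                             const std::vector<double> &B,
                             std::vector<double> &C, int Mc, int Nc, int Kc,
                             int Mr, int Nr) {
    for (int jc = 0; jc < N; jc += Nc)       // 1st loop around the macro-kernel
      for (int pc = 0; pc < K; pc += Kc) {   // 2nd loop around the macro-kernel
        // Packing of B would happen here (cf. CopyStmt_0).
        for (int ic = 0; ic < M; ic += Mc) { // 3rd loop around the macro-kernel
          // Packing of A would happen here (cf. CopyStmt_1).
          // Macro-kernel: two loops around the micro-kernel.
          for (int jr = jc; jr < std::min(jc + Nc, N); jr += Nr)
            for (int ir = ic; ir < std::min(ic + Mc, M); ir += Mr)
              // Micro-kernel: a loop around a rank-1 update of an
              // Mr x Nr block of C.
              for (int p = pc; p < std::min(pc + Kc, K); ++p)
                for (int i = ir; i < std::min(ir + Mr, M); ++i)
                  for (int j = jr; j < std::min(jr + Nr, N); ++j)
                    C[i * N + j] += A[i * K + p] * B[p * N + j];
        }
      }
  }

  int main() {
    const int M = 64, N = 64, K = 64;
    std::vector<double> A(M * K, 1.0), B(K * N, 1.0), C(M * N, 0.0);
    gemmBlisSketch(M, N, K, A, B, C, /*Mc=*/16, /*Nc=*/32, /*Kc=*/8, /*Mr=*/4,
                   /*Nr=*/8);
    return C[0] == K ? 0 : 1;
  }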
Index: polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
===================================================================
--- polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -30,9 +30,9 @@
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1055; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1055; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= 96 * c2 + 95; c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= min(1022, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,8 @@
 ; RUN: opt %loadPolly \
 ; RUN: -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: -polly-opt-isl -polly-tc-opt=true -polly-ast-detect-parallel \
+; RUN: -polly-print-ast -disable-output < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts
 
@@ -39,8 +40,77 @@
 ;        permutable: 1
 ;        coincident: [ 1, 1, 0 ]
 ;
-; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
+; CHECK:      {
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c0 = 0; c0 <= 49; c0 += 1)
+; CHECK-NEXT:     for (int c1 = 0; c1 <= 56; c1 += 1) {
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; CHECK-NEXT:         for (int c3 = 0; c3 <= min(31, -32 * c1 + 1799); c3 += 1)
+; CHECK-NEXT:           Stmt_for_body3(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:     }
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c0 = 0; c0 <= 49; c0 += 1)
+; CHECK-NEXT:     for (int c1 = 0; c1 <= 56; c1 += 1) {
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       for (int c2 = 0; c2 <= 31; c2 += 1)
+; CHECK-NEXT:         for (int c3 = 0; c3 <= min(31, -32 * c1 + 1799); c3 += 1)
+; CHECK-NEXT:           Stmt_for_body3_last(32 * c0 + c2, 32 * c1 + c3);
+; CHECK-NEXT:     }
+; CHECK-NEXT:   // 1st level tiling - Tiles
+; CHECK-NEXT:   for (int c1 = 0; c1 <= 8; c1 += 1) {
+; CHECK-NEXT:     for (int c4 = 256 * c1; c4 <= min(2199, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:       for (int c5 = 0; c5 <= 1799; c5 += 1)
+; CHECK-NEXT:         CopyStmt_0(0, c1, c4, c5);
+; CHECK-NEXT:     for (int c2 = 0; c2 <= 16; c2 += 1) {
+; CHECK-NEXT:       for (int c6 = 96 * c2; c6 <= min(1599, 96 * c2 + 95); c6 += 1)
+; CHECK-NEXT:         for (int c7 = 256 * c1; c7 <= min(2199, 256 * c1 + 255); c7 += 1)
+; CHECK-NEXT:           CopyStmt_1(0, c1, c2, c6, c7);
+; CHECK-NEXT:       // 1st level tiling - Points
+; CHECK-NEXT:       // Register tiling - Tiles
+; CHECK-NEXT:       for (int c3 = 0; c3 <= 224; c3 += 1)
+; CHECK-NEXT:         for (int c4 = 0; c4 <= min(23, -24 * c2 + 399); c4 += 1)
+; CHECK-NEXT:           for (int c5 = 0; c5 <= min(255, -256 * c1 + 2199); c5 += 1) {
+; CHECK-NEXT:             // Loop Vectorizer Disabled
+; CHECK-NEXT:             // Register tiling - Points
+; CHECK-NEXT:             {
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5);
+; CHECK-NEXT:               Stmt_for_body8(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5);
+; CHECK-NEXT:             }
+; CHECK-NEXT:           }
+; CHECK-NEXT:       }
+; CHECK-NEXT:     }
+; CHECK-NEXT: }
 ;
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -17,7 +17,6 @@
 ; CHECK-NOT: The matrix multiplication pattern was detected
 ; CHECK-NOT: The tensor contraction pattern was detected
 ; PATTERN-MATCHING-OPTS: The tensor contraction pattern was detected
-; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ; PARALLEL-AST-NOT: #pragma known-parallel
 ; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
 ;
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -18,7 +18,6 @@
 ; arrays.
 ;
 ; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -13,9 +13,9 @@
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c0 = 0; c0 <= 1; c0 += 1)
 ; CHECK-NEXT:      for (int c1 = 0; c1 <= 6; c1 += 1) {
-; CHECK-NEXT:        for (int c3 = 1536 * c0; c3 <= min(1999, 1536 * c0 + 1535); c3 += 1)
-; CHECK-NEXT:          for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
-; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c4 = 307 * c1; c4 <= min(1999, 307 * c1 + 306); c4 += 1)
+; CHECK-NEXT:          for (int c5 = 1536 * c0; c5 <= min(1999, 1536 * c0 + 1535); c5 += 1)
+; CHECK-NEXT:            CopyStmt_0(c0, c1, c4, c5);
 ; CHECK-NEXT:        for (int c2 = 0; c2 <= 24; c2 += 1) {
 ; CHECK-NEXT:          for (int c6 = 80 * c2; c6 <= 80 * c2 + 79; c6 += 1)
 ; CHECK-NEXT:            for (int c7 = 307 * c1; c7 <= min(1999, 307 * c1 + 306); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -88,9 +88,9 @@
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    }
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    // 1st level tiling - Tiles
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c3 = 0; c3 <= 1055; c3 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
-; EXTRACTION-OF-MACRO-KERNEL-NEXT:          CopyStmt_0(0, c3, c4);
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c5 = 0; c5 <= 1055; c5 += 1)
+; EXTRACTION-OF-MACRO-KERNEL-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:        for (int c6 = 96 * c2; c6 <= 96 * c2 + 95; c6 += 1)
 ; EXTRACTION-OF-MACRO-KERNEL-NEXT:          for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -22,13 +22,12 @@
 ;      C[i][j] += A[i][k] * B[k][j];
 ;
 ; CHECK: The tensor contraction pattern was detected
-; CHECK: The matrix multiplication pattern was detected
 ;
 ; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles
 ; PATTERN-MATCHING-OPTS-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) {
-; PATTERN-MATCHING-OPTS-NEXT:   for (int c3 = 256 * c1; c3 <= 256 * c1 + 255; c3 += 1)
-; PATTERN-MATCHING-OPTS-NEXT:     for (int c4 = 0; c4 <= 1023; c4 += 1)
-; PATTERN-MATCHING-OPTS-NEXT:       CopyStmt_0(0, c3, c4);
+; PATTERN-MATCHING-OPTS-NEXT:   for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:     for (int c5 = 0; c5 <= 1023; c5 += 1)
+; PATTERN-MATCHING-OPTS-NEXT:       CopyStmt_0(0, c1, c4, c5);
 ; PATTERN-MATCHING-OPTS-NEXT:   for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; PATTERN-MATCHING-OPTS-NEXT:     for (int c6 = 96 * c2; c6 <= min(1023, 96 * c2 + 95); c6 += 1)
 ; PATTERN-MATCHING-OPTS-NEXT:       for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -40,9 +40,9 @@
 ; CHECK-NEXT:    // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c0 = 0; c0 <= floord(nj - 1, 2048); c0 += 1)
 ; CHECK-NEXT:      for (int c1 = 0; c1 <= floord(nk - 1, 256); c1 += 1) {
-; CHECK-NEXT:        for (int c3 = 2048 * c0; c3 <= min(nj - 1, 2048 * c0 + 2047); c3 += 1)
-; CHECK-NEXT:          for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:            CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:          for (int c5 = 2048 * c0; c5 <= min(nj - 1, 2048 * c0 + 2047); c5 += 1)
+; CHECK-NEXT:            CopyStmt_0(c0, c1, c4, c5);
 ; CHECK-NEXT:        for (int c2 = 0; c2 <= floord(ni - 1, 96); c2 += 1) {
 ; CHECK-NEXT:          for (int c6 = 96 * c2; c6 <= min(ni - 1, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:            for (int c7 = 256 * c1; c7 <= min(nk - 1, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -40,9 +40,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1019; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= min(1019, 256 * c1 + 255); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= min(1019, 256 * c1 + 255); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1019; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= min(1019, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= min(1019, 256 * c1 + 255); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -24,9 +24,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 2; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1023; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 384 * c1; c4 <= min(1023, 384 * c1 + 383); c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 384 * c1; c4 <= min(1023, 384 * c1 + 383); c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1023; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 7; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 128 * c2; c6 <= 128 * c2 + 127; c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 384 * c1; c7 <= min(1023, 384 * c1 + 383); c7 += 1)
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
===================================================================
--- polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -25,9 +25,9 @@
 ;
 ; CHECK:         // 1st level tiling - Tiles
 ; CHECK-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
-; CHECK-NEXT:      for (int c3 = 0; c3 <= 1023; c3 += 1)
-; CHECK-NEXT:        for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
-; CHECK-NEXT:          CopyStmt_0(0, c3, c4);
+; CHECK-NEXT:      for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
+; CHECK-NEXT:        for (int c5 = 0; c5 <= 1023; c5 += 1)
+; CHECK-NEXT:          CopyStmt_0(0, c1, c4, c5);
 ; CHECK-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
 ; CHECK-NEXT:        for (int c6 = 96 * c2; c6 <= min(1023, 96 * c2 + 95); c6 += 1)
 ; CHECK-NEXT:          for (int c7 = 256 * c1; c7 <= 256 * c1 + 255; c7 += 1)