Index: polly/trunk/include/polly/ScheduleOptimizer.h =================================================================== --- polly/trunk/include/polly/ScheduleOptimizer.h +++ polly/trunk/include/polly/ScheduleOptimizer.h @@ -134,16 +134,29 @@ /// a micro-kernel. The micro-kernel is a loop around a rank-1 /// (i.e., outer product) update. /// - /// For a detailed description please see: - /// Analytical Modeling is Enough for High Performance BLIS + /// For a detailed description please see [1]. + /// + /// The order of the loops defines the data reused in the BLIS implementation + /// of gemm ([1]). In particular, elements of the matrix B, the second + /// operand of matrix multiplication, are reused between iterations of the + /// innermost loop. To keep the reused data in cache, only elements of matrix + /// A, the first operand of matrix multiplication, should be evicted during + /// an iteration of the innermost loop. To provide such a cache replacement + /// policy, elements of the matrix A can, in particular, be loaded first and, + /// consequently, be least-recently-used. + /// + /// In our case matrices are stored in row-major order instead of + /// column-major order used in the BLIS implementation ([1]). This affects only + /// the form of the BLIS micro kernel and the computation of its + /// parameters. In particular, reused elements of the matrix B are + /// successively multiplied by specific elements of the matrix A. + /// + /// Refs.: + /// [1] - Analytical Modeling is Enough for High Performance BLIS /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti /// Technical Report, 2014 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf /// - /// In our case matrices are stored in row-major order, which is taken into - /// account during the creation of the BLIS kernels and the computation - /// of their parameters. 
- /// /// @see ScheduleTreeOptimizer::createMicroKernel /// @see ScheduleTreeOptimizer::createMacroKernel /// @see getMicroKernelParams Index: polly/trunk/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/trunk/lib/Transform/ScheduleOptimizer.cpp +++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp @@ -538,8 +538,10 @@ __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel( __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) { - return applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, - 1); + applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1); + Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); + Node = permuteBandNodeDimensions(Node, 0, 1); + return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); } __isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel( @@ -553,6 +555,7 @@ {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1); Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); Node = permuteBandNodeDimensions(Node, 1, 2); + Node = permuteBandNodeDimensions(Node, 0, 2); return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); } @@ -609,18 +612,15 @@ return {1, 1, 1}; int Cbr = floor( (CacheLevelAssociativity[0] - 1) / - (1 + static_cast<double>(MicroKernelParams.Mr) / MicroKernelParams.Nr)); + (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr)); int Kc = (Cbr * CacheLevelSizes[0]) / - (MicroKernelParams.Nr * CacheLevelAssociativity[0] * 8); - double Cac = static_cast<double>(MicroKernelParams.Mr * Kc * 8 * - CacheLevelAssociativity[1]) / + (MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8); + double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) / CacheLevelSizes[1]; - double Cbc = static_cast<double>(MicroKernelParams.Nr * Kc * 8 * - CacheLevelAssociativity[1]) / + double Cbc = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) / 
CacheLevelSizes[1]; - int Mc = floor(MicroKernelParams.Mr / Cac); - int Nc = - floor((MicroKernelParams.Nr * (CacheLevelAssociativity[1] - 2)) / Cbc); + int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac); + int Nc = floor(1 / Cbc); return {Mc, Nc, Kc}; } @@ -867,36 +867,38 @@ Node = isl_schedule_node_parent(Node); Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0); auto *AccRel = - getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 6); - unsigned FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr; - unsigned SecondDimSize = MicroParams.Mr; + getMatMulAccRel(isl_map_copy(MapOldIndVar), MacroParams.Kc, 3, 7); + unsigned FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr; + unsigned SecondDimSize = MicroParams.Nr; auto *SAI = Stmt->getParent()->createScopArrayInfo( - MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize}); + MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize}); AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - auto *OldAcc = MemAccessA->getAccessRelation(); - MemAccessA->setNewAccessRelation(AccRel); + auto *OldAcc = MemAccessB->getAccessRelation(); + MemAccessB->setNewAccessRelation(AccRel); auto *ExtMap = - getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc); - ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 1, 1); + getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc); + isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1); + isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); + ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1); auto *Domain = Stmt->getDomain(); auto *NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain)); + OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain)); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); Node = createExtensionNode(Node, ExtMap); Node = 
isl_schedule_node_child(Node, 0); - AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 7); - FirstDimSize = MacroParams.Nc * MacroParams.Kc / MicroParams.Nr; - SecondDimSize = MicroParams.Nr; + AccRel = getMatMulAccRel(MapOldIndVar, MacroParams.Kc, 4, 6); + FirstDimSize = MacroParams.Mc * MacroParams.Kc / MicroParams.Mr; + SecondDimSize = MicroParams.Mr; SAI = Stmt->getParent()->createScopArrayInfo( - MemAccessB->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize}); + MemAccessA->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize}); AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - OldAcc = MemAccessB->getAccessRelation(); - MemAccessB->setNewAccessRelation(AccRel); - ExtMap = getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc); - isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 1, 1); + OldAcc = MemAccessA->getAccessRelation(); + MemAccessA->setNewAccessRelation(AccRel); + ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc); + isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1); isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MemAccessB->getAccessRelation(), Domain); + OldAcc, MemAccessA->getAccessRelation(), Domain); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); Node = createExtensionNode(Node, ExtMap); Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); Index: polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll =================================================================== --- polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll +++ polly/trunk/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll @@ -9,37 +9,37 @@ ; C[i][j] += alpha * A[i][k] * B[k][j]; ; } ; -; CHECK: double Packed_A[ { [] -> [(1024)] } ][ { [] -> [(4)] } ]; // Element size 8 -; CHECK: double Packed_B[ { [] -> [(3072)] } ][ { [] -> [(8)] } ]; // Element 
size 8 +; CHECK: double Packed_B[ { [] -> [(512)] } ][ { [] -> [(8)] } ]; // Element size 8 +; CHECK-NEXT: double Packed_A[ { [] -> [(6144)] } ][ { [] -> [(4)] } ]; // Element size 8 ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] }; -; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) }; +; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) }; ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; -; CHECK: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) }; +; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) }; ; ; CHECK: CopyStmt_0 -; CHECK: Domain := -; CHECK: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; -; CHECK: Schedule := -; CHECK: ; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK: null; -; CHECK: new: { CopyStmt_0[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 16*floor((i0)/16) <= 4*floor((o0)/256) <= i0 - 16*floor((i0)/16) }; -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK: null; -; CHECK: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg6[i0, i2] }; -; CHECK: CopyStmt_1 -; CHECK: Domain := -; CHECK: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 
1055 and 0 <= i2 <= 1023 }; -; CHECK: Schedule := -; CHECK: ; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK: null; -; CHECK: new: { CopyStmt_1[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 96*floor((i1)/96) <= 8*floor((o0)/256) <= i1 - 96*floor((i1)/96) }; -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK: null; -; CHECK: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg7[i2, i1] }; +; CHECK-NEXT: Domain := +; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; +; CHECK-NEXT: Schedule := +; CHECK-NEXT: ; +; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: null; +; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 8*floor((-i1 + o1)/8) = -i1 + o1 and 0 <= o1 <= 7 and -7 + i1 - 16*floor((i1)/16) <= 8*floor((o0)/256) <= i1 - 16*floor((i1)/16) }; +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: null; +; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; +; CHECK-NEXT: CopyStmt_1 +; CHECK-NEXT: Domain := +; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; +; CHECK-NEXT: Schedule := +; CHECK-NEXT: ; +; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: null; +; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1] : 256*floor((-i2 + o0)/256) = -i2 + o0 and 4*floor((-i0 + o1)/4) = -i0 + o1 and 0 <= o1 <= 3 and -3 + i0 - 96*floor((i0)/96) <= 4*floor((o0)/256) <= i0 - 96*floor((i0)/96) }; +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: null; +; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> MemRef_arg6[i0, i2] }; ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: 
polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ polly/trunk/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -11,107 +11,116 @@ ; } ; ; CHECK: { -; CHECK: // 1st level tiling - Tiles -; CHECK: for (int c0 = 0; c0 <= 32; c0 += 1) -; CHECK: for (int c1 = 0; c1 <= 32; c1 += 1) { -; CHECK: // 1st level tiling - Points -; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1) -; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1) -; CHECK: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); -; CHECK: } -; CHECK: // Register tiling - Tiles -; CHECK: for (int c0 = 0; c0 <= 263; c0 += 1) -; CHECK: for (int c1 = 0; c1 <= 131; c1 += 1) -; CHECK: for (int c2 = 0; c2 <= 1023; c2 += 1) { -; CHECK: // Register tiling - Points -; CHECK: // 1st level tiling - Tiles -; CHECK: // 1st level tiling - Points -; CHECK: { -; CHECK: Stmt_bb24(4 * c0, 8 * c1, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 1, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 2, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 3, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 4, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 5, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 6, c2); -; CHECK: Stmt_bb24(4 * c0, 8 * c1 + 7, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 1, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 2, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 3, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 4, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 5, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 6, c2); -; CHECK: Stmt_bb24(4 * c0 + 1, 8 * c1 + 7, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 1, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 2, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 3, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 4, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 5, c2); -; CHECK: 
Stmt_bb24(4 * c0 + 2, 8 * c1 + 6, c2); -; CHECK: Stmt_bb24(4 * c0 + 2, 8 * c1 + 7, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 1, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 2, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 3, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 4, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 5, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 6, c2); -; CHECK: Stmt_bb24(4 * c0 + 3, 8 * c1 + 7, c2); -; CHECK: } -; CHECK: } -; CHECK: } +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) { +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) +; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) +; CHECK-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); +; CHECK-NEXT: } +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= 131; c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= 263; c1 += 1) +; CHECK-NEXT: for (int c2 = 0; c2 <= 1023; c2 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 6, c2); +; 
CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 7, c2); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } ; -; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL: for (int c0 = 0; c0 <= 65; c0 += 1) -; EXTRACTION-OF-MACRO-KERNEL: for (int c1 = 0; c1 <= 3; c1 += 1) -; EXTRACTION-OF-MACRO-KERNEL: for (int c2 = 0; c2 <= 10; c2 += 1) { -; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL: for (int c3 = 0; c3 <= 3; c3 += 1) -; EXTRACTION-OF-MACRO-KERNEL: for (int c4 = 0; c4 <= 11; c4 += 1) -; EXTRACTION-OF-MACRO-KERNEL: for (int c5 = 0; c5 <= 255; c5 += 1) { -; EXTRACTION-OF-MACRO-KERNEL: // Register tiling - Points -; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL: { -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * 
c3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 1, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 
* c2 + 8 * c4 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 2, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: Stmt_bb24(16 * c0 + 4 * c3 + 3, 96 * c2 + 8 * c4 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL: } -; EXTRACTION-OF-MACRO-KERNEL: } -; EXTRACTION-OF-MACRO-KERNEL: } -; EXTRACTION-OF-MACRO-KERNEL: } +; EXTRACTION-OF-MACRO-KERNEL: { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: 
Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown"