Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -220,18 +220,20 @@ bool CreateScalarRefs) { auto &References = *static_cast(UserPtr); - if (Stmt->isBlockStmt()) - findReferencesInBlock(References, Stmt, Stmt->getBasicBlock()); - else { - assert(Stmt->isRegionStmt() && - "Stmt was neither block nor region statement"); - for (const BasicBlock *BB : Stmt->getRegion()->blocks()) - findReferencesInBlock(References, Stmt, BB); + if (!Stmt->isCopyStmt()) { + if (Stmt->isBlockStmt()) + findReferencesInBlock(References, Stmt, Stmt->getBasicBlock()); + else { + assert(Stmt->isRegionStmt() && + "Stmt was neither block nor region statement"); + for (const BasicBlock *BB : Stmt->getRegion()->blocks()) + findReferencesInBlock(References, Stmt, BB); + } } for (auto &Access : *Stmt) { - if (Access->isArrayKind()) { - auto *BasePtr = Access->getScopArrayInfo()->getBasePtr(); + if (Access->isLatestArrayKind()) { + auto *BasePtr = Access->getLatestScopArrayInfo()->getBasePtr(); if (Instruction *OpInst = dyn_cast(BasePtr)) if (Stmt->getParent()->contains(OpInst)) continue; Index: lib/External/isl/include/isl/schedule_node.h =================================================================== --- lib/External/isl/include/isl/schedule_node.h +++ lib/External/isl/include/isl/schedule_node.h @@ -149,6 +149,8 @@ __isl_keep isl_schedule_node *node); __isl_give isl_union_set *isl_schedule_node_domain_get_domain( __isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *isl_schedule_node_domain_union_domain( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain); __isl_give isl_union_map *isl_schedule_node_expansion_get_expansion( __isl_keep isl_schedule_node *node); __isl_give isl_union_pw_multi_aff *isl_schedule_node_expansion_get_contraction( @@ -157,6 +159,8 @@ __isl_keep isl_schedule_node *node); __isl_give isl_union_set *isl_schedule_node_filter_get_filter( __isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *isl_schedule_node_filter_union_filter( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *filter); __isl_give isl_set *isl_schedule_node_guard_get_guard( __isl_keep isl_schedule_node *node); __isl_give isl_id *isl_schedule_node_mark_get_id( Index: lib/External/isl/isl_schedule_node.c =================================================================== --- lib/External/isl/isl_schedule_node.c +++ lib/External/isl/isl_schedule_node.c @@ -2103,6 +2103,24 @@ return NULL; } +/* Union the filter of filter node "node" with "filter". + */ +__isl_give isl_schedule_node *isl_schedule_node_filter_union_filter( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *filter) +{ + isl_union_set *node_filter = NULL; + if (!node || !filter) + goto error; + + node_filter = isl_schedule_tree_filter_get_filter(node->tree); + node_filter = isl_union_set_union(filter, node_filter); + return isl_schedule_node_filter_set_filter(node, node_filter); +error: + isl_schedule_node_free(node); + isl_union_set_free(filter); + return NULL; +} + /* Intersect the filter of filter node "node" with "filter". * * If the filter of the node is already a subset of "filter", @@ -3549,6 +3567,37 @@ return NULL; } +/* Union the domain of domain node "node" with "domain". +*/ +__isl_give isl_schedule_node *isl_schedule_node_domain_union_domain( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain) +{ + isl_schedule_tree *tree; + isl_union_set *uset; + + if (!node || !domain || + !(isl_schedule_node_get_type(node) == isl_schedule_node_domain)) + goto error; + + tree = isl_schedule_tree_copy(node->tree); + uset = isl_schedule_tree_domain_get_domain(tree); + uset = isl_union_set_union(uset, domain); + tree = isl_schedule_tree_domain_set_domain(tree, + isl_union_set_copy(uset)); + node = isl_schedule_node_graft_tree(node, tree); + + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_gist(node, uset); + node = isl_schedule_node_parent(node); + + return node; +error: + isl_schedule_node_free(node); + isl_union_set_free(domain); + return NULL; +} + + /* Replace the domain of domain node "node" with the gist * of the original domain with respect to the parameter domain "context". */ Index: lib/External/isl/isl_schedule_node_private.h =================================================================== --- lib/External/isl/isl_schedule_node_private.h +++ lib/External/isl/isl_schedule_node_private.h @@ -54,6 +54,8 @@ __isl_give isl_schedule_node *isl_schedule_node_domain_intersect_domain( __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain); +__isl_give isl_schedule_node *isl_schedule_node_domain_union_domain( + __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain); __isl_give isl_schedule_node *isl_schedule_node_domain_gist_params( __isl_take isl_schedule_node *node, __isl_take isl_set *context); Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -1070,14 +1070,148 @@ return isl_map_apply_range(MapOldIndVar, AccessRel); } +namespace { +/// Access the array that could store all elements of the access relation +/// range. +/// +/// @param AccRel The access relation to be modified. +/// @param ArrayName The name of the array that should be created. +/// @param ElemType The type of elements of the array that should be created. +/// @return The modified access relation. +__isl_give isl_map *getAccessToArray(__isl_take isl_map *AccRel, + const char *ArrayName, Type *ElemType) { + auto InputDimsId = isl_map_get_tuple_id(AccRel, isl_dim_in); + auto *Stmt = static_cast(isl_id_get_user(InputDimsId)); + isl_id_free(InputDimsId); + auto *AccRelRange = isl_set_lexmax(isl_map_range(isl_map_copy(AccRel))); + assert(isl_set_is_singleton(AccRelRange) && + "The range of the access relation should have fixed bounds."); + auto *DimBound = isl_set_plain_get_val_if_fixed(AccRelRange, isl_dim_set, 0); + unsigned FirstDimSize = isl_val_get_num_si(DimBound) + 1; + isl_val_free(DimBound); + DimBound = isl_set_plain_get_val_if_fixed(AccRelRange, isl_dim_set, 1); + unsigned SecondDimSize = isl_val_get_num_si(DimBound) + 1; + isl_val_free(DimBound); + DimBound = isl_set_plain_get_val_if_fixed(AccRelRange, isl_dim_set, 2); + unsigned ThirdDimSize = isl_val_get_num_si(DimBound) + 1; + isl_val_free(DimBound); + isl_set_free(AccRelRange); + auto *SAI = Stmt->getParent()->createScopArrayInfo( + ElemType, ArrayName, {FirstDimSize, SecondDimSize, ThirdDimSize}); + return isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); +} + +/// Create and add the copy statement to the SCoP +/// +/// @param ReadAcc Memory access that reads from the memory. +/// @param WriteAccRel The access relation to be used to write to memory. +/// @param UnusedDim The dimension of the SCoP domain that is not used for +/// the copying. +/// @return The newly created copy statement. +ScopStmt *addCopyStmt(MemoryAccess *ReadAcc, __isl_take isl_map *WriteAccRel, + unsigned UnusedDim) { + auto *ReadAccRel = ReadAcc->getAccessRelation(); + auto InputDimsId = isl_map_get_tuple_id(ReadAccRel, isl_dim_in); + auto *Stmt = static_cast(isl_id_get_user(InputDimsId)); + isl_id_free(InputDimsId); + ReadAcc->setNewAccessRelation(isl_map_copy(WriteAccRel)); + auto *Domain = isl_set_fix_si(Stmt->getDomain(), isl_dim_set, UnusedDim, 0); + return Stmt->getParent()->addScopStmt(ReadAccRel, WriteAccRel, Domain); +} + +/// Union the node domain and @p UnionSet. +/// +/// @param Node The node with the domain to be modified. +/// @param UnionSet The union set to be united with the node domain. +/// @return The node with modified domain. __isl_give isl_schedule_node * -createExtensionNode(__isl_take isl_schedule_node *Node, - __isl_take isl_map *ExtensionMap) { - auto *Extension = isl_union_map_from_map(ExtensionMap); - auto *NewNode = isl_schedule_node_from_extension(Extension); - return isl_schedule_node_graft_before(Node, NewNode); +unionDomainNodeDomain(__isl_take isl_schedule_node *Node, + __isl_take isl_union_set *UnionSet) { + auto *ScheduleDomain = isl_schedule_node_get_domain(Node); + Node = isl_schedule_node_root(Node); + Node = + isl_schedule_node_domain_union_domain(Node, isl_union_set_copy(UnionSet)); + Node = isl_schedule_node_child(isl_schedule_node_root(Node), 0); + if (isl_schedule_node_get_type(Node) == isl_schedule_node_sequence) { + for (int i = 0; i < isl_schedule_node_n_children(Node); i++) { + Node = isl_schedule_node_child(Node, i); + auto *Filter = isl_schedule_node_filter_get_filter(Node); + if (isl_union_set_is_subset(ScheduleDomain, Filter)) { + Filter = isl_union_set_union(Filter, isl_union_set_copy(UnionSet)); + Node = isl_schedule_node_filter_union_filter(Node, Filter); + Node = isl_schedule_node_child(Node, 0); + break; + } + isl_union_set_free(Filter); + Node = isl_schedule_node_parent(Node); + } + } + isl_union_set_free(UnionSet); + isl_union_set_free(ScheduleDomain); + return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); } +/// Insert the childless filter node. +/// +/// @param Node The node to be modified. +/// @param Filter The union set to be used as a filter. +/// @return The child of the newly created filter node. +__isl_give isl_schedule_node * +insertChildlessFilterNode(__isl_take isl_schedule_node *Node, + __isl_take isl_union_set_list *Filter) { + Node = isl_schedule_node_insert_sequence(Node, Filter); + Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); + while (isl_schedule_node_get_type(Node) != isl_schedule_node_leaf) + isl_schedule_node_delete(Node); + return isl_schedule_node_parent(isl_schedule_node_parent(Node)); +} + +/// Get the band dimensions of the optimized GEMM. +/// +/// @param CopyStmtA The copy statement that accesses the packed array A. +/// @param CopyStmtB The copy statement that accesses the packed array B. +/// @param Pos The position of the first dimension that should not be +/// considered. +/// @param Number The number of dimensions that should not be considered. +/// @param Node The position of 1st level tiles. +/// @return The desired band dimensions. +__isl_give isl_multi_union_pw_aff * +getMatMulBandDimensions(ScopStmt *CopyStmtA, ScopStmt *CopyStmtB, unsigned Pos, + unsigned Number, __isl_take isl_schedule_node *Node) { + assert((isl_schedule_node_get_type(Node) == isl_schedule_node_band) && + "Subsequently examine the dimensions."); + auto *PartialSchedule = + isl_schedule_node_band_get_partial_schedule_union_map(Node); + assert((isl_union_map_n_map(PartialSchedule) == 1) && + "The node contains dimensions of the single SCoP statement."); + auto *PartialScheduleMap = isl_map_from_union_map(PartialSchedule); + assert((isl_map_dim(PartialScheduleMap, isl_dim_out) >= Pos + Number) && + "Operate on the output dimensions of the map."); + auto *CopyARel = isl_map_set_tuple_id(isl_map_copy(PartialScheduleMap), + isl_dim_in, CopyStmtA->getDomainId()); + auto *CopyBRel = isl_map_set_tuple_id(isl_map_copy(PartialScheduleMap), + isl_dim_in, CopyStmtB->getDomainId()); + CopyARel = + isl_map_drop_constraints_involving_dims(CopyARel, isl_dim_out, 0, 1); + CopyARel = isl_map_fix_si(CopyARel, isl_dim_out, 0, 0); + CopyBRel = + isl_map_drop_constraints_involving_dims(CopyBRel, isl_dim_out, 2, 1); + CopyBRel = isl_map_fix_si(CopyBRel, isl_dim_out, 2, 0); + CopyARel = isl_map_project_out(CopyARel, isl_dim_out, Pos, Number); + CopyBRel = isl_map_project_out(CopyBRel, isl_dim_out, Pos, Number); + auto *CopyADimensions = isl_union_map_from_map(CopyARel); + auto *CopyBDimensions = isl_union_map_from_map(CopyBRel); + auto *UnionOfDimensions = + isl_union_map_union(CopyADimensions, CopyBDimensions); + PartialScheduleMap = + isl_map_project_out(PartialScheduleMap, isl_dim_out, Pos, Number); + auto *PartialScheduleUnionMap = isl_union_map_from_map(PartialScheduleMap); + UnionOfDimensions = + isl_union_map_union(PartialScheduleUnionMap, UnionOfDimensions); + return isl_multi_union_pw_aff_from_union_map(UnionOfDimensions); +} +} // namespace + /// Apply the packing transformation. /// /// The packing transformation can be described as a data-layout @@ -1114,68 +1248,33 @@ __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar, MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams, MatMulInfoTy &MMI) { - auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in); - auto *Stmt = static_cast(isl_id_get_user(InputDimsId)); - isl_id_free(InputDimsId); - - // Create a copy statement that corresponds to the memory access to the - // matrix B, the second operand of the matrix multiplication. - Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); - Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); - Node = isl_schedule_node_parent(Node); - Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0); auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7); - unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr; - unsigned SecondDimSize = MacroParams.Kc; - unsigned ThirdDimSize = MicroParams.Nr; - auto *SAI = Stmt->getParent()->createScopArrayInfo( - MMI.B->getElementType(), "Packed_B", - {FirstDimSize, SecondDimSize, ThirdDimSize}); - AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - auto *OldAcc = MMI.B->getAccessRelation(); - MMI.B->setNewAccessRelation(AccRel); - auto *ExtMap = - isl_map_project_out(isl_map_copy(MapOldIndVar), isl_dim_out, 2, - isl_map_dim(MapOldIndVar, isl_dim_out) - 2); - ExtMap = isl_map_reverse(ExtMap); - ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.i, 0); - auto *Domain = Stmt->getDomain(); - - // Restrict the domains of the copy statements to only execute when also its - // originating statement is executed. - auto *DomainId = isl_set_get_tuple_id(Domain); - auto *NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MMI.B->getAccessRelation(), isl_set_copy(Domain)); - ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId)); - ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain)); - ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); - Node = createExtensionNode(Node, ExtMap); - - // Create a copy statement that corresponds to the memory access - // to the matrix A, the first operand of the matrix multiplication. + AccRel = getAccessToArray(AccRel, "Packed_B", MMI.B->getElementType()); + auto *StmtB = addCopyStmt(MMI.B, AccRel, MMI.i); + AccRel = getAccessToArray(getMatMulAccRel(MapOldIndVar, 4, 6), "Packed_A", + MMI.A->getElementType()); + auto *StmtA = addCopyStmt(MMI.A, AccRel, MMI.j); + auto *StmtADomain = isl_union_set_from_set(StmtA->getDomain()); + auto *StmtBDomain = isl_union_set_from_set(StmtB->getDomain()); + auto *Domain = isl_union_set_union(isl_union_set_copy(StmtADomain), + isl_union_set_copy(StmtBDomain)); + Node = unionDomainNodeDomain(Node, Domain); + auto *Stage1Dimensions = getMatMulBandDimensions(StmtA, StmtB, 2, 1, Node); + auto *Stage2Dimensions = getMatMulBandDimensions(StmtA, StmtB, 0, 2, Node); + Node = isl_schedule_node_delete(Node); + Node = isl_schedule_node_insert_partial_schedule(Node, Stage1Dimensions); + Node = isl_schedule_node_child(Node, 0); + auto *ScheduleDomain = isl_schedule_node_get_domain(Node); + auto *Filter = isl_union_set_list_from_union_set(StmtBDomain); + Filter = isl_union_set_list_add(Filter, isl_union_set_copy(ScheduleDomain)); + Node = insertChildlessFilterNode(Node, Filter); + Node = isl_schedule_node_child(isl_schedule_node_child(Node, 1), 0); + Node = isl_schedule_node_insert_partial_schedule(Node, Stage2Dimensions); Node = isl_schedule_node_child(Node, 0); - AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 4, 6); - FirstDimSize = MacroParams.Mc / MicroParams.Mr; - ThirdDimSize = MicroParams.Mr; - SAI = Stmt->getParent()->createScopArrayInfo( - MMI.A->getElementType(), "Packed_A", - {FirstDimSize, SecondDimSize, ThirdDimSize}); - AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - OldAcc = MMI.A->getAccessRelation(); - MMI.A->setNewAccessRelation(AccRel); - ExtMap = isl_map_project_out(MapOldIndVar, isl_dim_out, 3, - isl_map_dim(MapOldIndVar, isl_dim_out) - 3); - ExtMap = isl_map_reverse(ExtMap); - ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.j, 0); - NewStmt = Stmt->getParent()->addScopStmt(OldAcc, MMI.A->getAccessRelation(), - isl_set_copy(Domain)); - - // Restrict the domains of the copy statements to only execute when also its - // originating statement is executed. - ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId); - ExtMap = isl_map_intersect_range(ExtMap, Domain); - ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); - Node = createExtensionNode(Node, ExtMap); + Filter = isl_union_set_list_from_union_set(StmtADomain); + Filter = isl_union_set_list_add(Filter, ScheduleDomain); + Node = insertChildlessFilterNode(Node, Filter); + Node = isl_schedule_node_child(isl_schedule_node_child(Node, 1), 0); Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); } @@ -1322,10 +1421,10 @@ Node, MicroKernelParams, MacroKernelParams); if (!MapOldIndVar) return Node; - Node = - isolateAndUnrollMatMulInnerLoops(give(Node), MicroKernelParams).release(); - return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, + Node = optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, MacroKernelParams, MMI); + return isolateAndUnrollMatMulInnerLoops(give(Node), MicroKernelParams) + .release(); } bool ScheduleTreeOptimizer::isMatrMultPattern( @@ -1591,6 +1690,8 @@ S.setScheduleTree(NewSchedule); S.markAsOptimized(); + getAnalysis().releaseMemory(); + if (OptimizedScops) S.dump(); Index: test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll =================================================================== --- test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll +++ test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll @@ -29,9 +29,9 @@ ; ; CHECK: CopyStmt_0 ; CHECK-NEXT: Domain := -; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; +; CHECK-NEXT: { CopyStmt_0[0, i1, i2] : 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; ; CHECK-NEXT: Schedule := -; CHECK-NEXT: ; +; CHECK-NEXT: { CopyStmt_0[i0, i1, i2] -> [1, 0, o2, 0, 0, 0, 0, 0, 0, 0, 0, 0] : -255 + i2 <= 256o2 <= i2 }; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; ; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 2048*floor((i1)/2048) <= i1 - 8o0 }; @@ -40,9 +40,9 @@ ; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; ; CHECK-NEXT: CopyStmt_1 ; CHECK-NEXT: Domain := -; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] : 0 <= i0 <= 1055 and 0 <= i1 <= 1055 and 0 <= i2 <= 1023 }; +; CHECK-NEXT: { CopyStmt_1[i0, 0, i2] : 0 <= i0 <= 1055 and 0 <= i2 <= 1023 }; ; CHECK-NEXT: Schedule := -; CHECK-NEXT: ; +; CHECK-NEXT: { CopyStmt_1[i0, i1, i2] -> [1, 0, o2, 1, o4, 0, 0, 0, 0, 0, 0, 0] : -255 + i2 <= 256o2 <= i2 and -95 + i0 <= 96o4 <= i0 }; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; ; CHECK-NEXT: new: { CopyStmt_1[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 4*floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0 }; Index: test/ScheduleOptimizer/pattern-matching-based-opts_11.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/pattern-matching-based-opts_11.ll @@ -0,0 +1,141 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-target-throughput-vector-fma=1 \ +; RUN: -polly-target-latency-vector-fma=8 \ +; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \ +; RUN: -polly-target-2nd-cache-level-associativity=8 \ +; RUN: -polly-target-1st-cache-level-size=32768 \ +; RUN: -polly-target-vector-register-bitwidth=256 \ +; RUN: -polly-target-2nd-cache-level-size=262144 -polly-parallel < %s 2>&1 \ +; RUN: | FileCheck %s +; +; /* C := alpha*A*B + beta*C */ +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NJ; j++) +; { +; C[i][j] *= beta; +; for (k = 0; k < _PB_NK; ++k) +; C[i][j] += alpha * A[i][k] * B[k][j]; +; } +; +; This test case checks whether Polly detects the expected parallel loops +; in case of GEMM and the pattern matching optimizations. +; +; CHECK: // 1st level tiling - Tiles +; CHECK-NEXT: #pragma omp parallel for +; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= 32; c1 += 1) { +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) +; CHECK-NEXT: #pragma simd +; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) +; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); +; CHECK-NEXT: } +; CHECK-NEXT: // Inter iteration alias-free +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: #pragma minimal dependence distance: 1 +; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { +; CHECK-NEXT: #pragma omp parallel for +; CHECK-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1) +; CHECK-NEXT: #pragma simd +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: #pragma minimal dependence distance: 1 +; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; CHECK-NEXT: #pragma omp parallel for +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) +; CHECK-NEXT: #pragma simd +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= 256 * c1 + 255; c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: #pragma omp parallel for +; CHECK-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) +; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; CHECK-NEXT: #pragma minimal dependence distance: 1 +; CHECK-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 { +bb: + br label %bb8 + +bb8: ; preds = %bb29, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ] + br label %bb9 + +bb9: ; preds = %bb26, %bb8 + %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ] + %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10 + %tmp12 = load double, double* %tmp11, align 8 + %tmp13 = fmul double %tmp12, %arg4 + store double %tmp13, double* %tmp11, align 8 + br label %Copy_0 + +Copy_0: ; preds = %Copy_0, %bb9 + %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ] + %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15 + %tmp17 = load double, double* %tmp16, align 8 + %tmp18 = fmul double %tmp17, %arg3 + %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10 + %tmp20 = load double, double* %tmp19, align 8 + %tmp21 = fmul double %tmp18, %tmp20 + %tmp22 = load double, double* %tmp11, align 8 + %tmp23 = fadd double %tmp22, %tmp21 + store double %tmp23, double* %tmp11, align 8 + %tmp24 = add nuw nsw i64 %tmp15, 1 + %tmp25 = icmp ne i64 %tmp24, 1024 + br i1 %tmp25, label %Copy_0, label %bb26 + +bb26: ; preds = %Copy_0 + %tmp27 = add nuw nsw i64 %tmp10, 1 + %tmp28 = icmp ne i64 %tmp27, 1056 + br i1 %tmp28, label %bb9, label %bb29 + +bb29: ; preds = %bb26 + %tmp30 = add nuw nsw i64 %tmp, 1 + %tmp31 = icmp ne i64 %tmp30, 1056 + br i1 %tmp31, label %bb8, label %bb32 + +bb32: ; preds = %bb29 + ret void +}