Index: include/polly/ScheduleOptimizer.h =================================================================== --- include/polly/ScheduleOptimizer.h +++ include/polly/ScheduleOptimizer.h @@ -12,6 +12,7 @@ #ifndef POLLY_SCHEDULE_OPTIMIZER_H #define POLLY_SCHEDULE_OPTIMIZER_H +#include "polly/DependenceInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "isl/ctx.h" @@ -42,6 +43,31 @@ }; namespace polly { +/// Additional parameters of the schedule optimizer. +/// +/// Target Transform Info and the SCoP dependencies used by the schedule +/// optimizer. +/// +struct OptimizerAdditionalInfoTy { + const llvm::TargetTransformInfo *TTI; + const Dependences *D; +}; + +/// Parameters of the matrix multiplication operands. +/// +/// Parameters, which describe access relations that represent operands of the +/// matrix multiplication. +/// +struct MatMulInfoTy { + MemoryAccess *A = nullptr; + MemoryAccess *B = nullptr; + MemoryAccess *ReadFromC = nullptr; + MemoryAccess *WriteToC = nullptr; + int i = -1; + int j = -1; + int k = -1; +}; + extern bool DisablePollyTiling; class Scop; } // namespace polly @@ -59,11 +85,11 @@ /// /// @param Schedule The schedule object the transformations will be applied /// to. - /// @param TTI Target Transform Info. + /// @param OAI Target Transform Info and the SCoP dependencies. /// @returns The transformed schedule. static __isl_give isl_schedule * optimizeSchedule(__isl_take isl_schedule *Schedule, - const llvm::TargetTransformInfo *TTI = nullptr); + const polly::OptimizerAdditionalInfoTy *OAI = nullptr); /// Apply schedule tree transformations. /// @@ -75,11 +101,11 @@ /// - Prevectorization /// /// @param Node The schedule object post-transformations will be applied to. - /// @param TTI Target Transform Info. + /// @param OAI Target Transform Info and the SCoP dependencies. /// @returns The transformed schedule. 
static __isl_give isl_schedule_node * optimizeScheduleNode(__isl_take isl_schedule_node *Node, - const llvm::TargetTransformInfo *TTI = nullptr); + const polly::OptimizerAdditionalInfoTy *OAI = nullptr); /// Decide if the @p NewSchedule is profitable for @p S. /// @@ -128,10 +154,11 @@ /// Apply the BLIS matmul optimization pattern. /// - /// Apply the BLIS matmul optimization pattern. BLIS implements gemm as three - /// nested loops around a macro-kernel, plus two packing routines. - /// The macro-kernel is implemented in terms of two additional loops around - /// a micro-kernel. The micro-kernel is a loop around a rank-1 + /// Make the loops containing the matrix multiplication be the innermost + /// loops and apply the BLIS matmul optimization pattern. BLIS implements + /// gemm as three nested loops around a macro-kernel, plus two packing + /// routines. The macro-kernel is implemented in terms of two additional + /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 /// (i.e., outer product) update. /// /// For a detailed description please see [1]. @@ -167,9 +194,13 @@ /// @param Node The node that contains a band to be optimized. The node /// is required to successfully pass /// ScheduleTreeOptimizer::isMatrMultPattern. + /// @param TTI Target Transform Info. + /// @param MMI Parameters of the matrix multiplication operands. + /// @returns The transformed schedule. static __isl_give isl_schedule_node * optimizeMatMulPattern(__isl_take isl_schedule_node *Node, - const llvm::TargetTransformInfo *TTI); + const llvm::TargetTransformInfo *TTI, + polly::MatMulInfoTy &MMI); /// Check if this node is a band node we want to tile. /// @@ -266,7 +297,11 @@ /// the one used to get close-to-peak performance of matrix multiplications. /// /// @param Node The node to check. - static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node); + /// @param D The SCoP dependencies. + /// @param MMI Parameters of the matrix multiplication operands. 
+ static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node, + const polly::Dependences *D, + polly::MatMulInfoTy &MMI); /// Create the BLIS macro-kernel. /// Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -468,26 +468,302 @@ return Node; } -/// Check whether output dimensions of the map rely on the specified input -/// dimension. +/// Get the position of a dimension with a non-zero coefficient. /// -/// @param IslMap The isl map to be considered. -/// @param DimNum The number of an input dimension to be checked. -static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) { - auto *CheckedAccessRelation = - isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1); - CheckedAccessRelation = - isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1); - auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in); - CheckedAccessRelation = - isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId); - InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out); - CheckedAccessRelation = - isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId); - auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap); - isl_map_free(CheckedAccessRelation); - isl_map_free(IslMap); - return res; +/// Check that isl constraint @p Constraint has the only one non-zero +/// coefficient for dimensions that have type @p DimType. If this is true, +/// return the position of the dimension corresponding to the non-zero +/// coefficient and negative value, otherwise. +/// +/// @param Constraint The isl constraint to be checked. +/// @param DimType The type of the dimensions. +/// @return The position of the dimension in case the isl +/// constraint satisfies the requirements, a negative +/// value, otherwise. 
+static int getMatMulConstraintDim(__isl_keep isl_constraint *Constraint, + enum isl_dim_type DimType) { + int DimPos = -1; + auto *LocalSpace = isl_constraint_get_local_space(Constraint); + int LocalSpaceDimNum = isl_local_space_dim(LocalSpace, DimType); + for (int i = 0; i < LocalSpaceDimNum; i++) { + auto *Val = isl_constraint_get_coefficient_val(Constraint, DimType, i); + if (isl_val_is_zero(Val)) { + isl_val_free(Val); + continue; + } + if (DimPos >= 0 || (DimType == isl_dim_out && !isl_val_is_one(Val)) || + (DimType == isl_dim_in && !isl_val_is_negone(Val))) { + isl_val_free(Val); + isl_local_space_free(LocalSpace); + return -1; + } + DimPos = i; + isl_val_free(Val); + } + isl_local_space_free(LocalSpace); + return DimPos; +} + +/// Check the form of the isl constraint. +/// +/// Check that the @p DimInPos input dimension of the isl constraint +/// @p Constraint has a coefficient that is equal to negative one, the @p +/// DimOutPos has a coefficient that is equal to one and others +/// have coefficients equal to zero. +/// +/// @param Constraint The isl constraint to be checked. +/// @param DimInPos The input dimension of the isl constraint. +/// @param DimOutPos The output dimension of the isl constraint. +/// @return isl_stat_ok in case the isl constraint satisfies +/// the requirements, isl_stat_error otherwise. 
+static isl_stat isMatMulOperandConstraint(__isl_keep isl_constraint *Constraint, + int &DimInPos, int &DimOutPos) { + auto *Val = isl_constraint_get_constant_val(Constraint); + if (!isl_constraint_is_equality(Constraint) || !isl_val_is_zero(Val)) { + isl_val_free(Val); + return isl_stat_error; + } + isl_val_free(Val); + DimInPos = getMatMulConstraintDim(Constraint, isl_dim_in); + if (DimInPos < 0) + return isl_stat_error; + DimOutPos = getMatMulConstraintDim(Constraint, isl_dim_out); + if (DimOutPos < 0) + return isl_stat_error; + return isl_stat_ok; +} + +/// Check that the access relation corresponds to a non-constant operand +/// of the matrix multiplication. +/// +/// Access relations that correspond to non-constant operands of the matrix +/// multiplication depend only on two input dimensions and have two output +/// dimensions. The function checks that the isl basic map @p bmap satisfies +/// the requirements. The two input dimensions can be specified via @p user +/// array. +/// +/// @param bmap The isl basic map to be checked. +/// @param user The input dimensions of @p bmap. +/// @return isl_stat_ok in case isl basic map satisfies the requirements, +/// isl_stat_error otherwise. +static isl_stat isMatMulOperandBasicMap(__isl_take isl_basic_map *bmap, + void *user) { + auto *Constraints = isl_basic_map_get_constraint_list(bmap); + isl_basic_map_free(bmap); + if (isl_constraint_list_n_constraint(Constraints) != 2) { + isl_constraint_list_free(Constraints); + return isl_stat_error; + } + int InPosPair[] = {-1, -1}; + auto DimInPos = user ? 
static_cast<int *>(user) : InPosPair; + for (int i = 0; i < 2; i++) { + auto *Constraint = isl_constraint_list_get_constraint(Constraints, i); + int InPos, OutPos; + if (isMatMulOperandConstraint(Constraint, InPos, OutPos) == + isl_stat_error || + OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos)) { + isl_constraint_free(Constraint); + isl_constraint_list_free(Constraints); + return isl_stat_error; + } + DimInPos[OutPos] = InPos; + isl_constraint_free(Constraint); + } + isl_constraint_list_free(Constraints); + return isl_stat_ok; +} + +/// Permute the two dimensions of the isl map. +/// +/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that +/// have type @p DimType. +/// +/// @param Map The isl map to be modified. +/// @param DimType The type of the dimensions. +/// @param DstPos The first dimension. +/// @param SrcPos The second dimension. +/// @return The modified map. +__isl_give isl_map *permuteDimensions(__isl_take isl_map *Map, + enum isl_dim_type DimType, + unsigned DstPos, unsigned SrcPos) { + assert(DstPos < isl_map_dim(Map, DimType) && + SrcPos < isl_map_dim(Map, DimType)); + if (DstPos == SrcPos) + return Map; + isl_id *DimId = nullptr; + if (isl_map_has_tuple_id(Map, DimType)) + DimId = isl_map_get_tuple_id(Map, DimType); + auto FreeDim = DimType == isl_dim_in ? isl_dim_out : isl_dim_in; + auto MaxDim = std::max(DstPos, SrcPos); + auto MinDim = std::min(DstPos, SrcPos); + Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MaxDim, 1); + Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MinDim, 1); + Map = isl_map_move_dims(Map, DimType, MinDim, FreeDim, 1, 1); + Map = isl_map_move_dims(Map, DimType, MaxDim, FreeDim, 0, 1); + if (DimId) + Map = isl_map_set_tuple_id(Map, DimType, DimId); + return Map; +} + +/// Check accesses to non-scalar operands of the matrix multiplication. 
+/// +/// Check that an access of the SCoP statement, which corresponds to +/// the partial schedule @p PartialSchedule, is scalar in terms of loops +/// containing the matrix multiplication, in case it does not represent +/// an access to the non-scalar operands of the matrix multiplication or +/// its result. +/// +/// @param PartialSchedule The partial schedule of the SCoP statement. +/// @param MMI Parameters of the matrix multiplication operands. +/// @return True in case the corresponding SCoP statement +/// represents matrix multiplication and false, +/// otherwise. +static bool containsOnlyMatrMultAcc(__isl_keep isl_map *PartialSchedule, + const MatMulInfoTy &MMI) { + auto *InputDimId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in); + auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimId)); + isl_id_free(InputDimId); + unsigned OutDimNum = isl_map_dim(PartialSchedule, isl_dim_out); + assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest " + "and, consequently, the corresponding scheduling " + "functions have at least three dimensions."); + auto *MapI = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out, + MMI.i, OutDimNum - 1); + auto *MapJ = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out, + MMI.j, OutDimNum - 1); + auto *MapK = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out, + MMI.k, OutDimNum - 1); + for (auto *MemA = Stmt->begin(); MemA != Stmt->end() - 1; MemA++) { + auto *MemAccessPtr = *MemA; + if (MemAccessPtr->isArrayKind() && MemAccessPtr != MMI.A && + MemAccessPtr != MMI.B && MemAccessPtr != MMI.ReadFromC && + MemAccessPtr != MMI.WriteToC && + !(MemAccessPtr->isStrideZero(isl_map_copy(MapI)) && + MemAccessPtr->isStrideZero(isl_map_copy(MapJ)) && + MemAccessPtr->isStrideZero(isl_map_copy(MapK)))) { + isl_map_free(MapI); + isl_map_free(MapJ); + isl_map_free(MapK); + return false; + } + } + isl_map_free(MapI); + isl_map_free(MapJ); + isl_map_free(MapK); + return true; +} + +/// Check for 
dependencies corresponding to the matrix multiplication. +/// +/// Check that the only true dependence has the form +/// S(..., k, ...) -> S(..., k + 1, ...), where S is the SCoP statement +/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds +/// to the dependency produced by the matrix multiplication. +/// +/// @param Schedule The schedule of the SCoP statement. +/// @param D The SCoP dependencies. +/// @param Pos The parameter to describe an acceptable true dependence. +/// @return True in case dependencies correspond to the matrix multiplication +/// and false, otherwise. +static bool containsOnlyMatMulDep(__isl_keep isl_map *Schedule, + const Dependences *D, int Pos) { + auto *WAR = D->getDependences(Dependences::TYPE_WAR); + if (!isl_union_map_is_empty(WAR)) { + isl_union_map_free(WAR); + return false; + } + isl_union_map_free(WAR); + auto *RAW = D->getDependences(Dependences::TYPE_RAW); + auto *Domain = isl_map_domain(isl_map_copy(Schedule)); + auto *Space = isl_space_map_from_domain_and_range(isl_set_get_space(Domain), + isl_set_get_space(Domain)); + isl_set_free(Domain); + auto *Deltas = isl_map_deltas(isl_union_map_extract_map(RAW, Space)); + int DeltasDimNum = isl_set_dim(Deltas, isl_dim_set); + for (int i = 0; i < DeltasDimNum; i++) { + auto *Val = isl_set_plain_get_val_if_fixed(Deltas, isl_dim_set, i); + if (isl_val_is_nan(Val) || + !(isl_val_is_zero(Val) || (i == Pos && isl_val_is_one(Val)))) { + isl_val_free(Val); + isl_union_map_free(RAW); + isl_set_free(Deltas); + return false; + } + isl_val_free(Val); + } + isl_union_map_free(RAW); + isl_set_free(Deltas); + return true; +} + +/// Check the form of the access relation. +/// +/// Check that the access relation @p AccMap has the form M[i][j], where i +/// is a @p FirstPos and j is a @p SecondPos. +/// +/// @param AccMap The access relation to be checked. +/// @param FirstPos The index of the input dimension that is mapped to +/// the first output dimension. 
+/// @param SecondPos The index of the input dimension that is mapped to the +/// second output dimension. +/// @return True in case @p AccMap has the expected form and false, +/// otherwise. +static bool isMatMulOperandAcc(__isl_keep isl_map *AccMap, int &FirstPos, + int &SecondPos) { + int DimInPos[] = {FirstPos, SecondPos}; + if (isl_map_foreach_basic_map(AccMap, isMatMulOperandBasicMap, + static_cast<void *>(DimInPos)) != isl_stat_ok || + DimInPos[0] < 0 || DimInPos[1] < 0) + return false; + FirstPos = DimInPos[0]; + SecondPos = DimInPos[1]; + return true; +} + +/// Does the SCoP statement contain non-scalar operands of the matrix +/// multiplication. +/// +/// Check that the accesses to arrays, which are represented by the access +/// relations of the SCoP statement @p Stmt, are accesses to the non-scalar +/// operands of the matrix multiplication or its result. +/// +/// @param Stmt The SCoP statement to be checked. +/// @param MMI Parameters of the matrix multiplication operands. +/// @return True in case the corresponding SCoP statement represents matrix +/// multiplication and false, otherwise. 
+static bool containsMatMulReadAccesses(const ScopStmt *Stmt, + MatMulInfoTy &MMI) { + for (auto *MemA = Stmt->begin(); MemA != Stmt->end(); MemA++) { + auto *MemAccessPtr = (*MemA); + if (MemAccessPtr == MMI.WriteToC || !MemAccessPtr->isArrayKind()) + continue; + if (!MemAccessPtr->isRead()) + return false; + isl_map *AccMap = MemAccessPtr->getAccessRelation(); + // Classify the access as at most one operand: an access already matched + // as C must not also bind the still-unset index k via the A/B checks. + if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j)) { + if (MMI.ReadFromC || isl_map_n_basic_map(AccMap) != 1) { + isl_map_free(AccMap); + return false; + } + MMI.ReadFromC = MemAccessPtr; + } else if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k)) { + if (MMI.A || isl_map_n_basic_map(AccMap) != 1) { + isl_map_free(AccMap); + return false; + } + MMI.A = MemAccessPtr; + } else if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j)) { + if (MMI.B || isl_map_n_basic_map(AccMap) != 1) { + isl_map_free(AccMap); + return false; + } + MMI.B = MemAccessPtr; + } + isl_map_free(AccMap); + } + return true; } /// Check if the SCoP statement could probably be optimized with analytical @@ -495,50 +771,55 @@ /// /// containsMatrMult tries to determine whether the following conditions /// are true: -/// 1. all memory accesses of the statement will have stride 0 or 1, -/// if we interchange loops (switch the variable used in the inner -/// loop to the outer loop). -/// 2. all memory accesses of the statement except from the last one, are -/// read memory access and the last one is write memory access. -/// 3. all subscripts of the last memory access of the statement don't contain -/// the variable used in the inner loop. +/// 1. The last memory access modeling an array, MA1, represents writing to +/// memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or +/// S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement +/// under consideration. +/// 2. SCoP contains three access relations, MA2, MA3, and MA4 that represent +/// reading from memory and have the form S(..., i3, ...) -> M(i1, i3), +/// S(..., i3, ...) 
-> M(i3, i2), S(...) -> M(i1, i2), respectively. +/// 3. There are no loop-carried anti or true dependencies in any loop of +/// the loop nest except the true dependencies of the form +/// S(..., i3, ...) -> S(..., i3 + 1, ...). +/// 4. Memory accesses of the SCoP that are different from MA1, MA2, MA3, +/// and MA4 have stride 0, if the innermost loop is exchanged with any +/// of loops i1, i2 and i3. /// /// @param PartialSchedule The PartialSchedule that contains a SCoP statement /// to check. -static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) { - auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in); - auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); +/// @param D The SCoP dependencies. +/// @param MMI Parameters of the matrix multiplication operands. +static bool containsMatrMult(__isl_keep isl_map *PartialSchedule, + const Dependences *D, MatMulInfoTy &MMI) { + auto *InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in); + auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); isl_id_free(InputDimsId); - if (ScpStmt->size() <= 1) + if (Stmt->size() <= 1) return false; - auto MemA = ScpStmt->begin(); - for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end(); - i++, MemA++) - if (!(*MemA)->isRead() || - ((*MemA)->isArrayKind() && - !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || - (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))) + for (auto *MemA = Stmt->end() - 1; MemA != Stmt->begin(); MemA--) { + auto *MemAccessPtr = *MemA; + if (!MemAccessPtr->isArrayKind()) + continue; + if (!MemAccessPtr->isWrite()) return false; - MemA++; - if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() || - !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) || - (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))) - return false; - auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in); - return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1); -} + auto *AccMap = MemAccessPtr->getAccessRelation(); + 
if (isl_map_n_basic_map(AccMap) != 1 || + !isMatMulOperandAcc(AccMap, MMI.i, MMI.j)) { + isl_map_free(AccMap); + return false; + } + isl_map_free(AccMap); + MMI.WriteToC = MemAccessPtr; + break; + } -/// Circular shift of output dimensions of the integer map. -/// -/// @param IslMap The isl map to be modified. -static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) { - auto DimNum = isl_map_dim(IslMap, isl_dim_out); - if (DimNum == 0) - return IslMap; - auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in); - IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1); - IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1); - return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId); + if (!MMI.WriteToC || !containsMatMulReadAccesses(Stmt, MMI)) + return false; + if (!MMI.A || !MMI.B || !MMI.ReadFromC) + return false; + if (!containsOnlyMatrMultAcc(PartialSchedule, MMI)) + return false; + return containsOnlyMatMulDep(PartialSchedule, D, MMI.k); } /// Permute two dimensions of the band node. 
@@ -581,12 +862,15 @@ if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 && MacroKernelParams.Kc == 1) return Node; - Node = tileNode( - Node, "1st level tiling", - {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1); + int DimOutNum = isl_schedule_node_band_n_member(Node); + std::vector<int> TileSizes(DimOutNum, 1); + TileSizes[DimOutNum - 3] = MacroKernelParams.Mc; + TileSizes[DimOutNum - 2] = MacroKernelParams.Nc; + TileSizes[DimOutNum - 1] = MacroKernelParams.Kc; + Node = tileNode(Node, "1st level tiling", TileSizes, 1); Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); - Node = permuteBandNodeDimensions(Node, 1, 2); - Node = permuteBandNodeDimensions(Node, 0, 2); + Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1); + Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1); return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); } @@ -659,165 +943,6 @@ return {Mc, Nc, Kc}; } -/// Identify a memory access through the shape of its memory access relation. -/// -/// Identify the unique memory access in @p Stmt, that has an access relation -/// equal to @p ExpectedAccessRelation. -/// -/// @param Stmt The SCoP statement that contains the memory accesses under -/// consideration. -/// @param ExpectedAccessRelation The access relation that identifies -/// the memory access. -/// @return The memory access of @p Stmt whose memory access relation is equal -/// to @p ExpectedAccessRelation. nullptr in case there is no or more -/// than one such access. 
-MemoryAccess * -identifyAccessByAccessRelation(ScopStmt *Stmt, - __isl_take isl_map *ExpectedAccessRelation) { - if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out)) - ExpectedAccessRelation = - isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out); - MemoryAccess *IdentifiedAccess = nullptr; - for (auto *Access : *Stmt) { - auto *AccessRelation = Access->getAccessRelation(); - AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out); - if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) { - if (IdentifiedAccess) { - isl_map_free(AccessRelation); - isl_map_free(ExpectedAccessRelation); - return nullptr; - } - IdentifiedAccess = Access; - } - isl_map_free(AccessRelation); - } - isl_map_free(ExpectedAccessRelation); - return IdentifiedAccess; -} - -/// Add constrains to @Dim dimension of @p ExtMap. -/// -/// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3], -/// the following constraint will be added -/// Bound * OM <= IM <= Bound * (OM + 1) - 1, -/// where M is @p Dim and Bound is @p Bound. -/// -/// @param ExtMap The isl map to be modified. -/// @param Dim The output dimension to be modfied. -/// @param Bound The value that is used to specify the constraint. 
-/// @return The modified isl map -__isl_give isl_map * -addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim, - unsigned Bound) { - assert(Bound != 0); - auto *ExtMapSpace = isl_map_get_space(ExtMap); - auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace); - auto *Constr = - isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace)); - Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1); - Constr = - isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1)); - ExtMap = isl_map_add_constraint(ExtMap, Constr); - Constr = isl_constraint_alloc_inequality(ConstrSpace); - Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1); - Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound); - Constr = isl_constraint_set_constant_si(Constr, Bound - 1); - return isl_map_add_constraint(ExtMap, Constr); -} - -/// Create an access relation that is specific for matrix multiplication -/// pattern. -/// -/// Create an access relation of the following form: -/// { [O0, O1, O2]->[I1, I2, I3] : -/// FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1 -/// and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1 -/// and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1} -/// where FirstOutputDimBound is @p FirstOutputDimBound, -/// SecondOutputDimBound is @p SecondOutputDimBound, -/// ThirdOutputDimBound is @p ThirdOutputDimBound -/// -/// @param Ctx The isl context. -/// @param FirstOutputDimBound, -/// SecondOutputDimBound, -/// ThirdOutputDimBound The parameters of the access relation. -/// @return The specified access relation. 
-__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound, - unsigned SecondOutputDimBound, - unsigned ThirdOutputDimBound) { - auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3); - auto *extensionMap = isl_map_universe(NewRelSpace); - if (!FirstOutputDimBound) - extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0); - else - extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0, - FirstOutputDimBound); - if (!SecondOutputDimBound) - extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0); - else - extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1, - SecondOutputDimBound); - if (!ThirdOutputDimBound) - extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0); - else - extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2, - ThirdOutputDimBound); - return extensionMap; -} - -/// Create an access relation that is specific to the matrix -/// multiplication pattern. -/// -/// Create an access relation of the following form: -/// Stmt[O0, O1, O2]->[OI, OJ], -/// where I is @p I, J is @J -/// -/// @param Stmt The SCoP statement for which to generate the access relation. -/// @param I The index of the input dimension that is mapped to the first output -/// dimension. -/// @param J The index of the input dimension that is mapped to the second -/// output dimension. -/// @return The specified access relation. -__isl_give isl_map * -getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) { - auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2); - auto *AccessRel = isl_map_universe(AccessRelSpace); - AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0); - AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1); - AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId()); - return AccessRel; -} - -/// Identify the memory access that corresponds to the access to the second -/// operand of the matrix multiplication. 
-/// -/// Identify the memory access that corresponds to the access -/// to the matrix B of the matrix multiplication C = A x B. -/// -/// @param Stmt The SCoP statement that contains the memory accesses -/// under consideration. -/// @return The memory access of @p Stmt that corresponds to the access -/// to the second operand of the matrix multiplication. -MemoryAccess *identifyAccessA(ScopStmt *Stmt) { - auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2); - return identifyAccessByAccessRelation(Stmt, OriginalRel); -} - -/// Identify the memory access that corresponds to the access to the first -/// operand of the matrix multiplication. -/// -/// Identify the memory access that corresponds to the access -/// to the matrix A of the matrix multiplication C = A x B. -/// -/// @param Stmt The SCoP statement that contains the memory accesses -/// under consideration. -/// @return The memory access of @p Stmt that corresponds to the access -/// to the first operand of the matrix multiplication. -MemoryAccess *identifyAccessB(ScopStmt *Stmt) { - auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1); - return identifyAccessByAccessRelation(Stmt, OriginalRel); -} - /// Create an access relation that is specific to /// the matrix multiplication pattern. /// @@ -876,21 +1001,15 @@ /// transformations. /// @param MicroParams, MacroParams Parameters of the BLIS kernel /// to be taken into account. +/// @param MMI Parameters of the matrix multiplication operands. /// @return The optimized schedule node. static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern( __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar, - MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) { - // Check whether memory accesses of the SCoP statement correspond to - // the matrix multiplication pattern and if this is true, obtain them. 
+ MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams, + MatMulInfoTy &MMI) { auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in); auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId)); isl_id_free(InputDimsId); - MemoryAccess *MemAccessA = identifyAccessA(Stmt); - MemoryAccess *MemAccessB = identifyAccessB(Stmt); - if (!MemAccessA || !MemAccessB) { - isl_map_free(MapOldIndVar); - return Node; - } // Create a copy statement that corresponds to the memory access to the // matrix B, the second operand of the matrix multiplication. @@ -903,23 +1022,23 @@ unsigned SecondDimSize = MacroParams.Kc; unsigned ThirdDimSize = MicroParams.Nr; auto *SAI = Stmt->getParent()->createScopArrayInfo( - MemAccessB->getElementType(), "Packed_B", + MMI.B->getElementType(), "Packed_B", {FirstDimSize, SecondDimSize, ThirdDimSize}); AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - auto *OldAcc = MemAccessB->getAccessRelation(); - MemAccessB->setNewAccessRelation(AccRel); + auto *OldAcc = MMI.B->getAccessRelation(); + MMI.B->setNewAccessRelation(AccRel); auto *ExtMap = - getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc); - isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1); - isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); - ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1); + isl_map_project_out(isl_map_copy(MapOldIndVar), isl_dim_out, 2, + isl_map_dim(MapOldIndVar, isl_dim_out) - 2); + ExtMap = isl_map_reverse(ExtMap); + ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.i, 0); auto *Domain = Stmt->getDomain(); // Restrict the domains of the copy statements to only execute when also its // originating statement is executed. 
auto *DomainId = isl_set_get_tuple_id(Domain); auto *NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain)); + OldAcc, MMI.B->getAccessRelation(), isl_set_copy(Domain)); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId)); ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain)); ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId()); @@ -928,20 +1047,21 @@ // Create a copy statement that corresponds to the memory access // to the matrix A, the first operand of the matrix multiplication. Node = isl_schedule_node_child(Node, 0); - AccRel = getMatMulAccRel(MapOldIndVar, 4, 6); + AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 4, 6); FirstDimSize = MacroParams.Mc / MicroParams.Mr; ThirdDimSize = MicroParams.Mr; SAI = Stmt->getParent()->createScopArrayInfo( - MemAccessA->getElementType(), "Packed_A", + MMI.A->getElementType(), "Packed_A", {FirstDimSize, SecondDimSize, ThirdDimSize}); AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId()); - OldAcc = MemAccessA->getAccessRelation(); - MemAccessA->setNewAccessRelation(AccRel); - ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc); - isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1); - isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1); - NewStmt = Stmt->getParent()->addScopStmt( - OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain)); + OldAcc = MMI.A->getAccessRelation(); + MMI.A->setNewAccessRelation(AccRel); + ExtMap = isl_map_project_out(MapOldIndVar, isl_dim_out, 3, + isl_map_dim(MapOldIndVar, isl_dim_out) - 3); + ExtMap = isl_map_reverse(ExtMap); + ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.j, 0); + NewStmt = Stmt->getParent()->addScopStmt(OldAcc, MMI.A->getAccessRelation(), + isl_set_copy(Domain)); // Restrict the domains of the copy statements to only execute when also its // originating statement is executed. 
@@ -981,8 +1101,19 @@ } __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern( - __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) { + __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI, + MatMulInfoTy &MMI) { assert(TTI && "The target transform info should be provided."); + int DimOutNum = isl_schedule_node_band_n_member(Node); + assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest " + "and, consequently, the corresponding scheduling " + "functions have at least three dimensions."); + Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3); + int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j; + int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k; + Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2); + NewK = MMI.k == DimOutNum - 2 ? MMI.j : MMI.k; + Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1); auto MicroKernelParams = getMicroKernelParams(TTI); auto MacroKernelParams = getMacroKernelParams(MicroKernelParams); Node = createMacroKernel(Node, MacroKernelParams); @@ -995,21 +1126,21 @@ if (!MapOldIndVar) return Node; return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, - MacroKernelParams); + MacroKernelParams, MMI); } bool ScheduleTreeOptimizer::isMatrMultPattern( - __isl_keep isl_schedule_node *Node) { + __isl_keep isl_schedule_node *Node, const Dependences *D, + MatMulInfoTy &MMI) { auto *PartialSchedule = isl_schedule_node_band_get_partial_schedule_union_map(Node); - if (isl_schedule_node_band_n_member(Node) != 3 || + if (isl_schedule_node_band_n_member(Node) < 3 || isl_union_map_n_map(PartialSchedule) != 1) { isl_union_map_free(PartialSchedule); return false; } auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule); - NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule); - if (containsMatrMult(NewPartialSchedule)) { + if (containsMatrMult(NewPartialSchedule, D, MMI)) { 
isl_map_free(NewPartialSchedule); return true; } @@ -1023,11 +1154,13 @@ if (!isTileableBandNode(Node)) return Node; - if (PMBasedOpts && User && isMatrMultPattern(Node)) { + const OptimizerAdditionalInfoTy *OAI = + static_cast(User); + + MatMulInfoTy MMI; + if (PMBasedOpts && User && isMatrMultPattern(Node, OAI->D, MMI)) { DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); - const llvm::TargetTransformInfo *TTI; - TTI = static_cast(User); - Node = optimizeMatMulPattern(Node, TTI); + Node = optimizeMatMulPattern(Node, OAI->TTI, MMI); } return standardBandOpts(Node, User); @@ -1035,9 +1168,9 @@ __isl_give isl_schedule * ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule, - const llvm::TargetTransformInfo *TTI) { + const OptimizerAdditionalInfoTy *OAI) { isl_schedule_node *Root = isl_schedule_get_root(Schedule); - Root = optimizeScheduleNode(Root, TTI); + Root = optimizeScheduleNode(Root, OAI); isl_schedule_free(Schedule); auto S = isl_schedule_node_get_schedule(Root); isl_schedule_node_free(Root); @@ -1045,9 +1178,9 @@ } __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode( - __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) { + __isl_take isl_schedule_node *Node, const OptimizerAdditionalInfoTy *OAI) { Node = isl_schedule_node_map_descendant_bottom_up( - Node, optimizeBand, const_cast(static_cast(TTI))); + Node, optimizeBand, const_cast(static_cast(OAI))); return Node; } @@ -1248,8 +1381,9 @@ Function &F = S.getFunction(); auto *TTI = &getAnalysis().getTTI(F); + const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D)}; isl_schedule *NewSchedule = - ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI); + ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) { isl_schedule_free(NewSchedule); Index: test/ScheduleOptimizer/pattern-matching-based-opts.ll 
=================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts.ll @@ -15,63 +15,49 @@ ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" -define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) { +define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 { bb: br label %bb8 -bb8: ; preds = %bb39, %bb - %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ] - %tmp9 = icmp slt i32 %tmp, 1056 - br i1 %tmp9, label %bb10, label %bb41 - -bb10: ; preds = %bb8 - br label %bb11 - -bb11: ; preds = %bb37, %bb10 - %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ] - %tmp13 = icmp slt i32 %tmp12, 1056 - br i1 %tmp13, label %bb14, label %bb39 - -bb14: ; preds = %bb11 - %tmp15 = sext i32 %tmp12 to i64 - %tmp16 = sext i32 %tmp to i64 - %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16 - %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15 - %tmp19 = load double, double* %tmp18, align 8 - %tmp20 = fmul double %tmp19, %arg4 - store double %tmp20, double* %tmp18, align 8 - br label %bb21 - -bb21: ; preds = %bb24, %bb14 - %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ] - %tmp23 = icmp slt i32 %tmp22, 1024 - br i1 %tmp23, label %bb24, label %bb37 - -bb24: ; preds = %bb21 - %tmp25 = sext i32 %tmp22 to i64 - %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16 - %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25 - %tmp28 = load double, double* %tmp27, align 8 - %tmp29 = fmul double %arg3, %tmp28 - 
%tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25 - %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15 - %tmp32 = load double, double* %tmp31, align 8 - %tmp33 = fmul double %tmp29, %tmp32 - %tmp34 = load double, double* %tmp18, align 8 - %tmp35 = fadd double %tmp34, %tmp33 - store double %tmp35, double* %tmp18, align 8 - %tmp36 = add nsw i32 %tmp22, 1 - br label %bb21 - -bb37: ; preds = %bb21 - %tmp38 = add nsw i32 %tmp12, 1 - br label %bb11 - -bb39: ; preds = %bb11 - %tmp40 = add nsw i32 %tmp, 1 - br label %bb8 - -bb41: ; preds = %bb8 +bb8: ; preds = %bb29, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ] + br label %bb9 + +bb9: ; preds = %bb26, %bb8 + %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ] + %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10 + %tmp12 = load double, double* %tmp11, align 8 + %tmp13 = fmul double %tmp12, %arg4 + store double %tmp13, double* %tmp11, align 8 + br label %Copy_0 + +Copy_0: ; preds = %Copy_0, %bb9 + %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ] + %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15 + %tmp17 = load double, double* %tmp16, align 8 + %tmp18 = fmul double %tmp17, %arg3 + %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10 + %tmp20 = load double, double* %tmp19, align 8 + %tmp21 = fmul double %tmp18, %tmp20 + %tmp22 = load double, double* %tmp11, align 8 + %tmp23 = fadd double %tmp22, %tmp21 + store double %tmp23, double* %tmp11, align 8 + %tmp24 = add nuw nsw i64 %tmp15, 1 + %tmp25 = icmp ne i64 %tmp24, 1024 + br i1 %tmp25, label %Copy_0, label %bb26 + +bb26: ; preds = %Copy_0 + %tmp27 = add nuw nsw i64 %tmp10, 1 + %tmp28 = icmp ne i64 %tmp27, 1056 + br i1 %tmp28, label %bb9, label %bb29 + +bb29: ; preds = %bb26 + %tmp30 = add nuw nsw i64 %tmp, 1 + %tmp31 = icmp ne i64 %tmp30, 1056 + br i1 %tmp31, label %bb8, 
label %bb32 + +bb32: ; preds = %bb29 ret void } Index: test/ScheduleOptimizer/pattern-matching-based-opts_2.ll =================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts_2.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts_2.ll @@ -17,63 +17,49 @@ ; CHECK-NOT: The matrix multiplication pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" -define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) { +define internal void @kernel_gemm(i32 %arg, i32 %arg1, i32 %arg2, double %arg3, double %arg4, [1056 x double]* %arg5, [1024 x double]* %arg6, [1056 x double]* %arg7) #0 { bb: br label %bb8 -bb8: ; preds = %bb39, %bb - %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ] - %tmp9 = icmp slt i32 %tmp, 1056 - br i1 %tmp9, label %bb10, label %bb41 +bb8: ; preds = %bb29, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ] + br label %bb9 -bb10: ; preds = %bb8 - br label %bb11 +bb9: ; preds = %bb26, %bb8 + %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ] + %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10 + %tmp12 = load double, double* %tmp11, align 8 + %tmp13 = fmul double %tmp12, %arg4 + store double %tmp13, double* %tmp11, align 8 + br label %Copy_0 -bb11: ; preds = %bb37, %bb10 - %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ] - %tmp13 = icmp slt i32 %tmp12, 1056 - br i1 %tmp13, label %bb14, label %bb39 +Copy_0: ; preds = %Copy_0, %bb9 + %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ] + %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15 + %tmp17 = load double, double* %tmp16, align 8 + %tmp18 = fmul double %tmp17, %arg3 + %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10 + %tmp20 = load double, double* 
%tmp19, align 8 + %tmp21 = fmul double %tmp18, %tmp20 + %tmp22 = load double, double* %tmp11, align 8 + %tmp23 = fadd double %tmp22, %tmp21 + store double %tmp23, double* %tmp11, align 8 + %tmp24 = add nuw nsw i64 %tmp15, 1 + %tmp25 = icmp ne i64 %tmp24, 1024 + br i1 %tmp25, label %Copy_0, label %bb26 -bb14: ; preds = %bb11 - %tmp15 = sext i32 %tmp12 to i64 - %tmp16 = sext i32 %tmp to i64 - %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16 - %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15 - %tmp19 = load double, double* %tmp18, align 8 - %tmp20 = fmul double %tmp19, %arg4 - store double %tmp20, double* %tmp18, align 8 - br label %bb21 +bb26: ; preds = %Copy_0 + %tmp27 = add nuw nsw i64 %tmp10, 2 + %tmp28 = icmp ne i64 %tmp27, 1056 + br i1 %tmp28, label %bb9, label %bb29 -bb21: ; preds = %bb24, %bb14 - %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ] - %tmp23 = icmp slt i32 %tmp22, 1024 - br i1 %tmp23, label %bb24, label %bb37 +bb29: ; preds = %bb26 + %tmp30 = add nuw nsw i64 %tmp, 1 + %tmp31 = icmp ne i64 %tmp30, 1056 + br i1 %tmp31, label %bb8, label %bb32 -bb24: ; preds = %bb21 - %tmp25 = sext i32 %tmp22 to i64 - %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16 - %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25 - %tmp28 = load double, double* %tmp27, align 8 - %tmp29 = fmul double %arg3, %tmp28 - %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25 - %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15 - %tmp32 = load double, double* %tmp31, align 8 - %tmp33 = fmul double %tmp29, %tmp32 - %tmp34 = load double, double* %tmp18, align 8 - %tmp35 = fadd double %tmp34, %tmp33 - store double %tmp35, double* %tmp18, align 8 - %tmp36 = add nsw i32 %tmp22, 1 - br label %bb21 - -bb37: ; preds = %bb21 - %tmp38 = add nsw i32 %tmp12, 2 - br label %bb11 - 
-bb39: ; preds = %bb11 - %tmp40 = add nsw i32 %tmp, 1 - br label %bb8 - -bb41: ; preds = %bb8 +bb32: ; preds = %bb29 ret void } Index: test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -31,7 +31,7 @@ ; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) ; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) -; CHECK-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); +; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); ; CHECK-NEXT: } ; CHECK-NEXT: // Register tiling - Tiles ; CHECK-NEXT: for (int c0 = 0; c0 <= 131; c0 += 1) @@ -41,38 +41,38 @@ ; CHECK-NEXT: // 1st level tiling - Tiles ; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: { -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 1, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 2, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 3, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 4, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 5, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 6, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1, 8 * c0 + 7, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 1, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 2, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 3, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 4, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 5, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 6, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 1, 8 * c0 + 7, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 1, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 2, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 3, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 4, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 
2, 8 * c0 + 5, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 6, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 2, 8 * c0 + 7, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 1, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 2, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 3, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 4, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 5, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 6, c2); -; CHECK-NEXT: Stmt_bb24(4 * c1 + 3, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 1, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 2, 8 * c0 + 7, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 1, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 
3, 8 * c0 + 2, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 3, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 4, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 5, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 6, c2); +; CHECK-NEXT: Stmt_Copy_0(4 * c1 + 3, 8 * c0 + 7, c2); ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: } @@ -84,11 +84,17 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 31; c2 += 1) ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: CopyStmt_0(0, c3, c4); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 256 * c1; c5 <= 256 * c1 + 255; c5 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: CopyStmt_1(c3, 0, c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) @@ -96,43 +102,44 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); -; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 
* c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -142,59 +149,44 @@ bb: br label %bb8 -bb8: ; preds = %bb39, %bb - %tmp = phi i32 [ 0, %bb ], [ %tmp40, %bb39 ] - %tmp9 = icmp slt i32 %tmp, 1056 - br i1 %tmp9, label %bb10, label %bb41 +bb8: ; preds = %bb29, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp30, %bb29 ] + br label %bb9 -bb10: ; preds = %bb8 - br label %bb11 +bb9: ; preds = %bb26, %bb8 + %tmp10 = phi i64 [ 0, %bb8 ], [ %tmp27, %bb26 ] + %tmp11 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp, i64 %tmp10 + %tmp12 = load double, double* %tmp11, align 8 + %tmp13 = fmul double %tmp12, %arg4 + store double %tmp13, double* %tmp11, align 8 + br label %Copy_0 -bb11: ; preds = %bb37, %bb10 - %tmp12 = phi i32 [ 0, %bb10 ], [ %tmp38, %bb37 ] - %tmp13 = icmp slt i32 %tmp12, 1056 - br i1 %tmp13, label %bb14, label %bb39 +Copy_0: ; preds = %Copy_0, %bb9 + %tmp15 = phi i64 [ 0, %bb9 ], [ %tmp24, %Copy_0 ] + %tmp16 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp, i64 %tmp15 + %tmp17 = load double, double* %tmp16, align 8 + %tmp18 = fmul double %tmp17, %arg3 + %tmp19 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp15, i64 %tmp10 + %tmp20 = load double, double* %tmp19, align 8 + %tmp21 = fmul double %tmp18, %tmp20 + %tmp22 = load double, double* %tmp11, align 8 + %tmp23 = fadd double %tmp22, %tmp21 + store double %tmp23, double* %tmp11, align 8 + %tmp24 = add nuw nsw i64 %tmp15, 1 + %tmp25 = icmp ne i64 %tmp24, 1024 + br i1 %tmp25, label %Copy_0, label %bb26 -bb14: ; 
preds = %bb11 - %tmp15 = sext i32 %tmp12 to i64 - %tmp16 = sext i32 %tmp to i64 - %tmp17 = getelementptr inbounds [1056 x double], [1056 x double]* %arg5, i64 %tmp16 - %tmp18 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp17, i64 0, i64 %tmp15 - %tmp19 = load double, double* %tmp18, align 8 - %tmp20 = fmul double %tmp19, %arg4 - store double %tmp20, double* %tmp18, align 8 - br label %bb21 +bb26: ; preds = %Copy_0 + %tmp27 = add nuw nsw i64 %tmp10, 1 + %tmp28 = icmp ne i64 %tmp27, 1056 + br i1 %tmp28, label %bb9, label %bb29 -bb21: ; preds = %bb24, %bb14 - %tmp22 = phi i32 [ 0, %bb14 ], [ %tmp36, %bb24 ] - %tmp23 = icmp slt i32 %tmp22, 1024 - br i1 %tmp23, label %bb24, label %bb37 +bb29: ; preds = %bb26 + %tmp30 = add nuw nsw i64 %tmp, 1 + %tmp31 = icmp ne i64 %tmp30, 1056 + br i1 %tmp31, label %bb8, label %bb32 -bb24: ; preds = %bb21 - %tmp25 = sext i32 %tmp22 to i64 - %tmp26 = getelementptr inbounds [1024 x double], [1024 x double]* %arg6, i64 %tmp16 - %tmp27 = getelementptr inbounds [1024 x double], [1024 x double]* %tmp26, i64 0, i64 %tmp25 - %tmp28 = load double, double* %tmp27, align 8 - %tmp29 = fmul double %arg3, %tmp28 - %tmp30 = getelementptr inbounds [1056 x double], [1056 x double]* %arg7, i64 %tmp25 - %tmp31 = getelementptr inbounds [1056 x double], [1056 x double]* %tmp30, i64 0, i64 %tmp15 - %tmp32 = load double, double* %tmp31, align 8 - %tmp33 = fmul double %tmp29, %tmp32 - %tmp34 = load double, double* %tmp18, align 8 - %tmp35 = fadd double %tmp34, %tmp33 - store double %tmp35, double* %tmp18, align 8 - %tmp36 = add nsw i32 %tmp22, 1 - br label %bb21 - -bb37: ; preds = %bb21 - %tmp38 = add nsw i32 %tmp12, 1 - br label %bb11 - -bb39: ; preds = %bb11 - %tmp40 = add nsw i32 %tmp, 1 - br label %bb8 - -bb41: ; preds = %bb8 +bb32: ; preds = %bb29 ret void }