Index: polly/include/polly/Support/ISLTools.h
===================================================================
--- polly/include/polly/Support/ISLTools.h
+++ polly/include/polly/Support/ISLTools.h
@@ -522,6 +522,11 @@
 /// value. Otherwise, return NaN.
 isl::val getConstant(isl::pw_aff PwAff, bool Max, bool Min);
 
+/// If the relation @p Map lies on a hyperplane where the dimension at position
+/// @p Pos of type @p Dim has a fixed value, then return that value.
+/// Otherwise, return NaN.
+isl::val getConstant(isl::map Map, isl::dim Dim, int Pos);
+
 /// Check that @p End is valid and return an iterator from @p Begin to @p End
 ///
 /// Use case example:
Index: polly/lib/Support/ISLTools.cpp
===================================================================
--- polly/lib/Support/ISLTools.cpp
+++ polly/lib/Support/ISLTools.cpp
@@ -263,6 +263,18 @@
   }
 }
 
+isl::val polly::getConstant(isl::map Map, isl::dim Dim, int Pos) {
+  unsigned NumDims = unsignedFromIslSize(Map.dim(Dim));
+  if (Pos < 0)
+    Pos = NumDims + Pos;
+  assert(unsigned(Pos) < NumDims && "Dimension index must be in range");
+  // TODO: The isl_map_plain_get_val_if_fixed function is not robust, since its
+  // result is different depending on the internal representation.
+  // Replace it with a different implementation.
+  return isl::manage(isl_map_plain_get_val_if_fixed(
+      Map.get(), static_cast<enum isl_dim_type>(Dim), Pos));
+}
+
 isl::union_map polly::shiftDim(isl::union_map UMap, isl::dim Dim, int Pos,
                                int Amount) {
   isl::union_map Result = isl::union_map::empty(UMap.ctx());
Index: polly/lib/Transform/MatmulOptimizer.cpp
===================================================================
--- polly/lib/Transform/MatmulOptimizer.cpp
+++ polly/lib/Transform/MatmulOptimizer.cpp
@@ -13,10 +13,13 @@
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Simplify.h"
+#include "polly/Support/GICHelper.h"
 #include "polly/Support/ISLTools.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
@@ -126,6 +129,24 @@
              "macro-kernel, by Nr, the parameter of the micro-kernel"),
     cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<bool>
+    PMBasedTCOpts("polly-tc-opt",
+                  cl::desc("Perform optimizations of tensor contractions based "
+                           "on pattern matching"),
+                  cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<bool>
+    PMBasedMMMOpts("polly-matmul-opt",
+                   cl::desc("Perform optimizations of matrix multiplications "
+                            "based on pattern matching"),
+                   cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> OptComputeOut(
+    "polly-tc-dependences-computeout",
+    cl::desc("Bound the dependence analysis by a maximal amount of "
+             "computational steps (0 means no bound)"),
+    cl::Hidden, cl::init(500000), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 namespace {
 /// Parameters of the micro kernel.
 ///
@@ -160,6 +181,59 @@
   int k = -1;
 };
 
+/// Parameters of the tensor contraction operands.
+///
+/// Parameters that describe the access relations representing operands of the
+/// tensor contraction.
+struct TCInfoTy {
+  /// @{
+  /// Memory accesses that represent reading from tensors, which are operands of the
+  /// tensor contraction.
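+  ///
+  /// For illustration, in a GEMM-like contraction such as
+  ///
+  ///   for (i) for (j) for (k)
+  ///     C[i][j] += A[i][k] * B[k][j];
+  ///
+  /// these are the reads of A[i][k] and B[k][j].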
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  /// @}
+
+  /// @{
+  /// Memory accesses that represent reading from and writing into the tensor,
+  /// which contains the result of the tensor contraction.
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  /// @}
+
+  /// @{
+  /// Input dimensions of the schedule space, which represent free
+  /// indices of tensors.
+  SmallDenseSet<int> I;
+  SmallDenseSet<int> J;
+  /// @}
+
+  /// @{
+  /// Input dimensions of the schedule space, which represent contracted
+  /// indices of tensors.
+  SmallDenseSet<int> P;
+  /// @}
+
+  /// @{
+  /// Sizes of tensor dimensions for corresponding input dimensions of
+  /// the schedule space. The size of a tensor dimension can be larger than
+  /// the size of the corresponding input dimension of the schedule space.
+  /// Strictly speaking, this does not correspond to a tensor contraction;
+  /// however, such a pattern will still be optimized by the transformation.
+  SmallVector<int> DimensionSizes;
+  SmallVector<int> ADimensions;
+  SmallVector<int> BDimensions;
+  SmallVector<int> CDimensions;
+  /// @}
+
+  /// @{
+  /// Permutations of indices of I, J, and P, which describe operands of
+  /// the tensor contraction and its result.
+  SmallVector<int> OrderedI;
+  SmallVector<int> OrderedJ;
+  SmallVector<int> OrderedP;
+  /// @}
+};
+
 /// Create an isl::union_set, which describes the option of the form
 /// [isolate[] -> unroll[x]].
 ///
@@ -1010,11 +1084,7 @@
                                MatMulInfoTy &MMI) {
   auto PartialSchedule = isl::manage(
       isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
-  Node = Node.child(0);
-  auto LeafType = isl_schedule_node_get_type(Node.get());
-  Node = Node.parent();
-  if (LeafType != isl_schedule_node_leaf ||
-      isl_schedule_node_band_n_member(Node.get()) < 3 ||
+  if (isl_schedule_node_band_n_member(Node.get()) < 3 ||
       Node.get_schedule_depth().release() != 0 ||
       isl_union_map_n_map(PartialSchedule.get()) != 1)
     return false;
@@ -1024,14 +1094,618 @@
   return false;
 }
 
+/// Get the dimension size.
+///
+/// Return the size of the dimension @p Pos, which is obtained from @p SAI.
+/// Return -1 in the case of the first dimension of a multi-dimensional array,
+/// since the ScopArrayInfo class does not carry the size of the first dimension.
+///
+/// @param SAI The information about the array.
+/// @param Pos The position of the dimension.
+/// @return The size of the dimension.
+static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
+  if (Pos == 0)
+    return -1;
+  const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
+  assert(SCEVDimSize);
+  auto *ConstantDimSize = dyn_cast<SCEVConstant>(SCEVDimSize);
+  assert(ConstantDimSize);
+  auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
+  assert(IntDimSize);
+  return IntDimSize->getSExtValue();
+}
+
+/// Check whether the access relation has the specified form.
+///
+/// Check that the access relation @p AccMap has the form T[I0, …, In], where
+/// indexes I0, …, In are specified by @p Dimensions.
+///
+/// @param Domain The domain of the access relation.
+/// @param AccMap The access relation to be checked.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+///         otherwise.
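+///
+/// For illustration, assuming the access Stmt[i][j][k] -> A[k][i] and
+/// Dimensions = {2, 0}, the map PossibleTensor constructed below is
+/// { Stmt[i][j][k] -> A[k][i] } and the check succeeds, whereas an access
+/// such as Stmt[i][j][k] -> A[k][i + 1] is rejected.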
+static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap, + ArrayRef Dimensions) { + isl::space Space = AccMap.get_space(); + if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size()) + return false; + isl::map Universe = isl::map::universe(Space); + + // Create an access relation of the following form: + // [I0, …, Im] -> [Il, …, In], where indexes + // Il, …, In are specified by @p Dimensions. + isl::map PossibleTensor = isl::manage(Universe.copy()); + unsigned DimInSize = unsignedFromIslSize(Space.dim(isl::dim::in)); + for (unsigned i = 0; i < Dimensions.size(); i++) { + const int InPos = Dimensions[i]; + if ((InPos >= static_cast(DimInSize)) || (InPos < 0)) + return false; + PossibleTensor = + PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i); + } + + AccMap = AccMap.intersect_domain(Domain); + PossibleTensor = PossibleTensor.intersect_domain(Domain); + + // If AccMap != PossibleTensor here (the two maps have been gisted at + // this point), it means that the writes are not complete, or in other + // words, it is a Partial write and Partial writes must be rejected. + return AccMap.is_equal(PossibleTensor); +} + +/// Check whether the access represents the tensor contraction operand. +/// +/// Check that the access relation @p AccMap has the form T[i1, …, in]. +/// Obtained indexes i1, …, in, their sizes and their permutation are stored +/// into @p IndexSet, @p DimensionSizes, and @p Dimensions, respectively. +/// +/// @param Domain The domain of the access relation. +/// @param AccMap The access relation to be checked. +/// @param IndexSet The subset of the input dimensions. +/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions. +/// @param Dimensions The permutation of the subset of the input dimensions. +/// @return True if @p AccMap has the expected form and false, +/// otherwise. +static bool isTCOperandAcc(isl::set Domain, isl::map AccMap, + SmallDenseSet &IndexSet, + SmallVectorImpl &DimensionSizes, + SmallVectorImpl &Dimensions) { + isl::id Id = AccMap.get_tuple_id(isl::dim::out); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + assert(SAI && "AccMap should represent memory access"); + + // Fix values of output dimensions with respect to their positions. + // In the case of the tensor contraction, values of output dimensions are + // fixed and form a permutation of a subset of values of input dimensions. + // + // For example, in the case of Stmt[i][j][k] -> A[k][i], which represents + // the operand of the tensor contraction, we get the following map by fixing + // the output dimensions Stmt[1][j][0] -> A[0][1]. + // + // We store the permutation of the subset of the input dimensions {2, 0} into + // @p Dimensions. + // + // The obtained permutation and the isCorrectAccessMap function are used to + // check whether the access relation @p AccMap represents the tensor + // contraction operand. For example, in the case of + // Stmt[i][j][k] -> A[i-1][j+1], we get Stmt[1][0][k] -> A[0][1] and, + // consequently, {1, 0}, which is rejected by isCorrectAccessMap, + // since it corresponds to Stmt[i][j][k] -> A[j][i]. + isl::map CheckMap = isl::manage(AccMap.copy()); + unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out)); + for (unsigned i = 0; i < OutDimNum; i++) + CheckMap = CheckMap.fix_si(isl::dim::out, i, i); + + // Try to obtain the permutation and sizes of corresponding input dimensions. 
+ Dimensions.assign(OutDimNum, -1); + for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) { + isl::val Val = getConstant(CheckMap, isl::dim::in, i); + if (!Val.is_int()) + continue; + int OutPos = -1; + llvm::APInt ValAPInt = APIntFromVal(Val); + if (ValAPInt.isSignedIntN(32)) + OutPos = ValAPInt.getSExtValue(); + if ((OutPos < 0) || (OutPos >= static_cast(OutDimNum)) || + IndexSet.count(i)) + return false; + IndexSet.insert(i); + Dimensions[OutPos] = i; + if (DimensionSizes[i] <= 0) + DimensionSizes[i] = getDimSize(SAI, OutPos); + } + + return isCorrectAccessMap(Domain, AccMap, Dimensions); +} + +/// Find the intersection of two sets. +/// +/// Find the intersection of the set @p A and the set @p B. +/// +/// @param A, B Sets to intersect. +/// @return The set intersection. +static SmallDenseSet intersect(const SmallDenseSet &A, + const SmallDenseSet &B) { + SmallDenseSet Intersection = A; + set_intersect(Intersection, B); + return Intersection; +} + +/// Check whether the set is a superset. +/// +/// Check that the set @p A is a superset of @p B. +/// +/// @param A, B Sets to be checked. +/// @return True if the set A is a superset of B. +static bool isSuperset(const SmallDenseSet &A, + const SmallDenseSet &B) { + return intersect(A, B).size() == B.size(); +} + +/// Find the union of two sets. +/// +/// Find the union of the set @p A and the set @p B. +/// +/// @param A, B Sets to unite. +/// @return The set union. +static SmallDenseSet unite(const SmallDenseSet &A, + const SmallDenseSet &B) { + SmallDenseSet Union = A; + set_union(Union, B); + return Union; +} + +/// Determine the access that writes to the tensor, which contains +/// the result of the tensor contraction. +/// +/// @param Domain The domain of the statement. +/// @param Stmt The statement, which writes to memory. +/// @param TCI The information about the tensor contraction. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @return The determined MemoryAccess, or nullptr if there is no necessary +/// access within the SCoP. +static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt, + TCInfoTy &TCI, + SmallDenseSet &IandJIndexSet) { + TCI.WriteToC = nullptr; + SmallVector Accesses = getAccessesInOrder(*Stmt); + for (MemoryAccess *MemA : reverse(Accesses)) { + if (!MemA->isLatestArrayKind()) + continue; + // The last memory access should be a write memory access. + if (!MemA->isWrite()) + return nullptr; + + isl::map AccMap = MemA->getLatestAccessRelation(); + if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes, + TCI.CDimensions)) + return nullptr; + + return MemA; + } + return nullptr; +} + +/// Determine an access, which reads elements of an operand of the tensor +/// contraction +/// +/// @param MemAccessPtr The access, which reads elements of the tensor. +/// @param IndexSet The set, which contains indexes of the tensors. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @param Dimensions The permutation of the subset of the input dimensions. +/// @param TCI The information about the tensor contraction. +/// @return True if the memory access @p MemAccessPtr corresponds +/// to the tensor contraction. +static bool setReadAccess(MemoryAccess *MemAccessPtr, + const SmallDenseSet &IndexSet, + const SmallDenseSet &IandJIndexSet, + SmallVector Dimensions, TCInfoTy &TCI) { + if (!TCI.A) { + // Probably IndexSet is a union of I and P sets. + if (!isSuperset(IndexSet, TCI.P)) + return false; + + // Obtain the set I. 
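+    // For example, for a GEMM-like kernel C[i][j] += A[i][k] * B[k][j]
+    // (an assumed illustration), IndexSet of the first operand is {i, k}
+    // and P is {k}, so I becomes {i} and, further below, J becomes {j}.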
+ TCI.I = std::move(set_difference(IndexSet, TCI.P)); + if (!isSuperset(IandJIndexSet, TCI.I)) + return false; + + // Obtain the set J. + TCI.J = std::move(set_difference(IandJIndexSet, TCI.I)); + + // Set the first operand of the tensor contraction. + TCI.A = MemAccessPtr; + TCI.ADimensions = std::move(Dimensions); + return true; + } + + if (!TCI.B) { + // IndexSet should be a union of J and P sets. + if (unite(TCI.P, TCI.J) != IndexSet) + return false; + + // Set the second operand of the tensor contraction. + TCI.B = MemAccessPtr; + TCI.BDimensions = std::move(Dimensions); + return true; + } + + return false; +} + +/// Check that all memory accesses of the statement, except from the last +/// one, are read memory accesses, which read elements of operands of the tensor +/// contraction and its result. +/// +/// @param Domain The domain of the statement. +/// @param Stmt The statement, which writes to memory. +/// @param TCI The information about the tensor contraction. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @return True if all read memory accesses of the statement @p Stmt correspond +/// to the tensor contraction. +static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI, + SmallDenseSet &IandJIndexSet) { + TCI.A = nullptr; + TCI.B = nullptr; + TCI.ReadFromC = nullptr; + SmallVector Accesses = getAccessesInOrder(*Stmt); + for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) { + MemoryAccess *MemAccessPtr = *MemA; + if (!MemAccessPtr->isLatestArrayKind()) + continue; + + // All memory accesses, except from the last one, should be read memory + // accesses. + if (MemAccessPtr->isWrite()) + return false; + + // There is only one memory access, which reads elements of the result of + // the tensor contraction. + isl::map AccMap = MemAccessPtr->getLatestAccessRelation(); + if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) { + if (TCI.ReadFromC) + return false; + TCI.ReadFromC = MemAccessPtr; + continue; + } + + SmallVector Dimensions; + SmallDenseSet IndexSet; + if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes, + Dimensions)) + return false; + + if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI)) + return false; + } + + // Check that there are read memory accesses, which read elements of operands + // of the tensor contraction and its result. + return TCI.ReadFromC && TCI.A && TCI.B; +} + +/// Check accesses to operands of the tensor contraction. +/// +/// Check that accesses of the SCoP statement, which corresponds to +/// the partial schedule @p PartialSchedule, represent accesses +/// to the non-scalar operands of the tensor contraction. +/// +/// @param Domain The domain of the SCoP statement. +/// @param PartialSchedule The partial schedule of the SCoP statement. +/// @param TCI Parameters of the tensor contraction operands. +/// @return True if the corresponding SCoP statement +/// represents tensor contraction and false, +/// otherwise. +static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule, + TCInfoTy &TCI) { + isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in); + ScopStmt *Stmt = static_cast(InputDimsId.get_user()); + + // In region statements, the order of memory accesses execution is not + // predictable at compile-time. 
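+  // A tensor contraction statement also needs at least one read and one write
+  // access, so statements with fewer than two accesses are rejected as well.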
+ if ((Stmt->size() <= 1) || Stmt->isRegionStmt()) + return false; + + unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in)); + TCI.DimensionSizes.resize(DimNum); + SmallDenseSet IandJIndexSet; + + TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet); + if (!TCI.WriteToC) + return false; + + if (intersect(IandJIndexSet, TCI.P).size() != 0) + return false; + + if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet)) + return false; + + return true; +} + +/// Check that dependency corresponds to the tensor contraction carried over +/// loop dimension @p Dim. +/// +/// Check that the dependency has the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement. For this purpose, we analyze the set @p DepDelta, which +/// represents the differences between image elements and domain elements of +/// the corresponding map. +/// +/// @param DepDelta The set contains the differences between image elements +/// and corresponding domain elements of the map, which +/// represents the dependency. +/// @param Dim The position of the index ki. +/// @param BoundDeltas In the case of indexes of ki, the difference between +/// image elements and corresponding domain elements +/// corresponds to the difference between lexicographic +/// minimum and lexicographic maximum of the corresponding +/// dimension of the domain of the statement. +/// @param IndexSet Obtained indexes ki, which describe the dependency. +/// @return True if dependencies correspond to the tensor contraction +/// and false, otherwise. +static bool isReductionCarriedOverDim(isl::set DepDelta, unsigned Dim, + isl::pw_multi_aff BoundDeltas, + const SmallDenseSet &IndexSet) { + isl::space Space = DepDelta.get_space(); + isl::set Superset = isl::set::universe(Space); + for (unsigned i = 0; i < Dim; i += 1) + Superset = Superset.fix_si(isl::dim::set, i, 0); + Superset = Superset.fix_si(isl::dim::set, Dim, 1); + + // Check that the difference between the image element and the domain element + // is equal to one in the case of the index ki. Image elements and + // corresponding domain elements should be equal in the case of positions, + // which are lower than the specified position. + if (!DepDelta.is_subset(Superset)) + return false; + + // Compute a set, which is used to analyze how values of + // the domain are related to the map that describes the dependency. + isl_pw_multi_aff *DepDeltaPW = isl_pw_multi_aff_from_set(DepDelta.copy()); + BoundDeltas = BoundDeltas.add(isl::manage(DepDeltaPW)); + isl_set *ComplementRawSet = isl_set_from_pw_multi_aff(BoundDeltas.release()); + isl::set Complement = isl::manage(ComplementRawSet); + + for (unsigned i : rangeIslSize(Dim + 1, DepDelta.dim(isl::dim::set))) { + if (!IndexSet.count(i)) { + // Check the difference between the image element and the domain element + // in the case of indexes, which do not describe the dependency. + if (DepDelta.plain_get_val_if_fixed(isl::dim::set, i).is_zero()) + continue; + return false; + } + + // In the case of other indexes, which describe the dependency, + // the difference between the image element and the domain element + // should be equal to the difference between lexicographic minimum and + // lexicographic maximum of the domain of the statement. + if (!Complement.plain_get_val_if_fixed(isl::dim::set, i).is_zero()) + return false; + } + + return true; +} + +/// Check that dependencies correspond to the tensor contraction. 
+/// +/// Check that there are only true dependencies of the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement represented by @p Schedule. Such dependencies are produced by +/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet. +/// +/// The form of anti and output dependencies is specified implicitly by +/// the form the SCoP statement, which is checked by subsequent analysis. +/// +/// @param Schedule The schedule of the SCoP statement. +/// @param D The SCoP dependencies. +/// @param Domain The domain of the statement. +/// @param IndexSet Obtained indexes ki, which describe the dependencies. +/// @return True if dependencies correspond to the tensor contraction +/// and false, otherwise. +static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D, + SmallDenseSet &IndexSet, isl::set Domain) { + IslMaxOperationsGuard MaxOpGuard(Schedule.ctx().get(), OptComputeOut); + + isl::union_map Dep = + D->getDependences(Dependences::TYPE_RAW | Dependences::TYPE_RED); + + isl::space DomainSpace = Schedule.get_space().domain(); + isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace); + isl::set DepDeltas = Dep.extract_map(Space).deltas(); + isl::size DeltasDimNum = DepDeltas.dim(isl::dim::set); + isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff(); + isl::pw_multi_aff BoundDeltas = Domain.lexmax_pw_multi_aff().sub(LowerBound); + + for (int i : reverse(rangeIslSize(0, DeltasDimNum))) { + // In the case of the tensor contraction, the difference between image + // elements and domain elements lies on a hyperplane where a dimension + // has the fixed value one. + isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1); + if (Intersection.is_empty()) + continue; + + if (!isReductionCarriedOverDim(Intersection, i, BoundDeltas, IndexSet)) + return false; + + IndexSet.insert(i); + DepDeltas = DepDeltas.subtract(Intersection); + } + + // In the case of the tensor contraction, all dependencies should have + // the previously described form. + if ((unsignedFromIslSize(DeltasDimNum) == 0) || !DepDeltas.is_empty()) + return false; + + return true; +} + +/// Check if the SCoP statement could probably be optimized with analytical +/// modeling. +/// +/// containsTC tries to determine whether the following conditions +/// are true: +/// +/// 1. The last memory access modeling an array, MA1, represents writing to +/// memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)), +/// where S is the SCoP statement under consideration and shuffle(I, J) +/// is a permutation of indexes of sets I and J. +/// 2. There are only true dependencies of the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement represented by @p Schedule and ki are indexes of the set P. +/// 3. SCoP contains only three access relations, MA2, MA3, and MA4 that +/// represent reading from memory and have the form +/// S(..., I, ..., P, ...) -> M(shuffle(I, P)), +/// S(..., P, ..., J, ...) -> M(shuffle(J, P)), +/// S(...) -> M(shuffle(I, J)), respectively. +/// +/// @param PartialSchedule The PartialSchedule that contains a SCoP statement +/// to check. +/// @param D The SCoP dependencies. +/// @param TCI Parameters of the tensor contraction operands. +/// @param Domain The domain of the statement. 
+/// @return True if dependencies and memory accesses correspond to the tensor +/// contraction and false, otherwise. +static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D, + TCInfoTy &TCI, isl::set Domain) { + if (!containsOnlyTcDeps(PartialSchedule, D, TCI.P, Domain)) + return false; + + // TODO: handle cases of scalar multiplication if needed. + if (TCI.P.size() == 0) + return false; + + if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI)) + return false; + + // TODO: handle cases of GEMV if needed. + if ((TCI.I.size() == 0) || (TCI.J.size() == 0)) + return false; + + return true; +} + +/// Check if this node contains a partial schedule that could +/// probably be optimized with analytical modeling. +/// +/// isTCPattern performs the following steps to determine whether the SCoP +/// represents a TC-like kernel, which is a perfectly nested set of loops +/// with a data usage pattern that is similar to that produced by the tensor +/// contraction (for a detailed description please see [1]): +/// +/// 1. Checks that the node is innermost band node. +/// 2. Checks that the partial schedule contains only one statement. +/// 3. Check that all predecessors of the node contain all band nodes for +/// the statement. This corresponds to a straightforward implementation +/// of TC. +/// 4. Analyses the dependencies to determine contraction dimensions. +/// 5. Check that the last memory access modeling an array, represents writing +/// to the result of the TC-like kernel. +/// 6. Check that SCoP contains only three access relations that represent +/// reading of the operands of the TC-like kernel. +/// +/// [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor +/// Operations: A Compiler-Oriented Approach // ACM Transactions +/// Architecture and Code Optimization (TACO). 2018. +/// Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029. +/// +/// If this is the case, we could use an approach that is similar to +/// the one used to get close-to-peak performance of matrix multiplications. +/// +/// @param Node The node to check. +/// @param D The SCoP dependencies. +/// @param TCI Parameters of the tensor contraction operands. +static bool isTCPattern(isl::schedule_node Node, const Dependences *D, + TCInfoTy &TCI) { + Node = Node.child(0); + isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map(); + isl::union_set Domain = Node.domain(); + Node = Node.parent(); + + // The partial schedule should contain only one statement. + // TODO: This constraint should not be intrinsic to the algorithm. + if (isl_union_set_n_set(Domain.get()) != 1) + return false; + + bool HasFilterNode = false; + isl_schedule_node_type NodeType = isl_schedule_node_get_type(Node.get()); + + // Check that all predecessors of the node contain all band nodes + // for the statement, which represents the tensor contraction. + // This corresponds to a straightforward implementation of TC with/without + // DeLICM applied. + // + // For example, this covers the matrix multiplication pattern after a full + // run of -polly-optree and -polly-delicm, where the write access is not + // through the original memory access, but trough a PHI node that was + // delicmed. Subsequently, such band nodes will be replaced by a single band + // node. 
+ // + // The corresponding schedule can be the following, where Stmt_for_body8 + // contains the matrix multiplication: + // + // domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and + // 0 <= i1 <= 1799 and + // 0 <= i2 <= 2199; + // Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and + // 0 <= i1 <= 1799; + // Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and + // 0 <= i1 <= 1799 }" + // child: + // sequence: + // - filter: "{ Stmt_for_body3[i0, i1] }" + // child: + // schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, + // { Stmt_for_body3[i0, i1] -> [(i1)] }]" + // permutable: 1 + // coincident: [ 1, 1 ] + // - filter: "{ Stmt_for_body3_last[i0, i1] }" + // child: + // schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, + // { Stmt_for_body3_last[i0, i1] -> [(i1)] }]" + // permutable: 1 + // coincident: [ 1, 1 ] + // - filter: "{ Stmt_for_body8[i0, i1, i2] }" + // child: + // schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] }, + // { Stmt_for_body8[i0, i1, i2] -> [(i1)] }, + // { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]" + // permutable: 1 + // coincident: [ 1, 1, 0 ] + // + while (NodeType != isl_schedule_node_domain) { + if (HasFilterNode && (NodeType == isl_schedule_node_band)) + return false; + + if (NodeType == isl_schedule_node_filter) + HasFilterNode = true; + + Node = Node.parent(); + NodeType = isl_schedule_node_get_type(Node.get()); + } + + isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule); + if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain))) + return true; + + return false; +} + } // namespace isl::schedule_node polly::tryOptimizeMatMulPattern(isl::schedule_node Node, const llvm::TargetTransformInfo *TTI, const Dependences *D) { + TCInfoTy TCI; + if (PMBasedTCOpts && isTCPattern(Node, D, TCI)) + LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n"); MatMulInfoTy MMI; - if (isMatrMultPattern(Node, D, MMI)) { + if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) { LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); return optimizeMatMulPattern(Node, TTI, MMI); } Index: polly/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/lib/Transform/ScheduleOptimizer.cpp +++ polly/lib/Transform/ScheduleOptimizer.cpp @@ -302,6 +302,16 @@ /// @param Node The node to check. static bool isTileableBandNode(isl::schedule_node Node); + /// Check if this node is a band node we want to transform using pattern + /// matching. + /// + /// We look for innermost band nodes where individual dimensions are marked as + /// permutable. There is no restriction on the number of individual + /// dimensions. + /// + /// @param Node The node to check. + static bool isPMOptimizableBandNode(isl::schedule_node Node); + /// Pre-vectorizes one scheduling dimension of a schedule band. /// /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and @@ -462,13 +472,23 @@ return true; } -bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { +/// Check if this node is a band node, which has only one child. +/// +/// @param Node The node to check. 
+static bool isOneTimeParentBandNode(isl::schedule_node Node) { if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) return false; if (isl_schedule_node_n_children(Node.get()) != 1) return false; + return true; +} + +bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { + if (!isOneTimeParentBandNode(Node)) + return false; + if (!isl_schedule_node_band_get_permutable(Node.get())) return false; @@ -480,6 +500,13 @@ return isSimpleInnermostBand(Node); } +bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) { + if (!isOneTimeParentBandNode(Node)) + return false; + + return Node.child(0).isa(); +} + __isl_give isl::schedule_node ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { if (FirstLevelTiling) { @@ -525,10 +552,8 @@ assert(OAI && "Expecting optimization options"); isl::schedule_node Node = isl::manage(NodeArg); - if (!isTileableBandNode(Node)) - return Node.release(); - if (OAI->PatternOpts) { + if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) { isl::schedule_node PatternOptimizedSchedule = tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); if (!PatternOptimizedSchedule.is_null()) { @@ -537,6 +562,9 @@ } } + if (!isTileableBandNode(Node)) + return Node.release(); + if (OAI->Postopts) Node = applyTileBandOpt(Node); Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadPolly \ ; RUN: -polly-pattern-matching-based-opts=true \ ; RUN: -polly-optree -polly-delicm -polly-simplify \ -; RUN: -polly-opt-isl -debug < %s 2>&1 \ +; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \ ; RUN: | FileCheck %s ; REQUIRES: asserts @@ -10,6 +10,36 @@ ; is not through the original memory access, but trough a PHI node that was ; delicmed. This test covers the polybench 2mm and 3mm cases. 
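+;
+; A kernel of roughly the following shape (an assumed sketch, not the exact
+; polybench source) gives rise to this pattern:
+;
+;   for (i = 0; i < 1600; i++)
+;     for (j = 0; j < 1800; j++) {
+;       tmp[i][j] = 0.0;
+;       for (k = 0; k < 2200; ++k)
+;         tmp[i][j] += alpha * A[i][k] * B[k][j];
+;     }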
 ;
+; This test case generates the following schedule, which contains filters:
+;
+; domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 and
+;                                          0 <= i2 <= 2199;
+;            Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and
+;                                     0 <= i1 <= 1799;
+;            Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 }"
+; child:
+;  sequence:
+;  - filter: "{ Stmt_for_body3[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body3_last[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+;    child:
+;      schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1, 0 ]
+;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
===================================================================
--- /dev/null
+++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that the pattern matching detects the tensor contraction pattern
+; after a full run of -polly-delicm. This test case generates the following
+; schedule, which contains two band nodes. Without DeLICM, two statements are
+; generated.
+; +; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and +; 0 <= i2 <= 31 and 0 <= i3 <= 31 and +; 0 <= i4 <= 31 and 0 <= i5 <= 31 }" +; child: +; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]" +; permutable: 1 +; coincident: [ 1, 1, 1, 1, 0 ] +; child: +; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]" +; permutable: 1 +; child: +; leaf +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 32; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 32; ++l) +; for (w = 0; w < 32; ++w) +; for (q = 0; q < 32; ++q) +; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ] + %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7 + %.pre = load double, double* %arrayidx37, align 8 + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ] + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv + %i1 = load double, double* %arrayidx21, align 8 + %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13 + %i2 = load double, double* %arrayidx29, align 8 + %mul = fmul fast double %i2, %i1 + %add = fadd fast double %i, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond.not, label %for.inc38, label %for.body15 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 + %exitcond9.not = icmp eq i64 %indvars.iv.next8, 32 + br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader + +for.inc41: ; 
preds = %for.inc38 + %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 + %exitcond12.not = icmp eq i64 %indvars.iv.next11, 32 + br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond15.not = icmp eq i64 %indvars.iv.next14, 32 + br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 + %exitcond18.not = icmp eq i64 %indvars.iv.next17, 32 + br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond21.not = icmp eq i64 %indvars.iv.next20, 32 + br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll @@ -1,6 +1,7 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false -debug < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \ +; RUN: -debug -polly-tc-opt < %s 2>&1 | FileCheck %s ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines ; REQUIRES: asserts ; @@ -14,6 +15,7 @@ ; } ; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected ; PARALLEL-AST: #pragma known-parallel ; PARALLEL-AST: #pragma known-parallel Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll @@ -8,7 +8,8 @@ ; RUN: -polly-target-1st-cache-level-size=32768 \ ; RUN: -polly-target-vector-register-bitwidth=256 \ ; RUN: -polly-target-2nd-cache-level-size=262144 \ -; RUN: -polly-opt-isl -debug < %s 2>&1 \ +; RUN: -polly-opt-isl -debug \ +; RUN: -polly-tc-opt=true < %s 2>&1 \ ; RUN: | FileCheck %s ; REQUIRES: asserts ; @@ -16,6 +17,7 @@ ; in case scalar memory accesses were replaced by accesses to newly created ; arrays. 
; +; CHECK: The tensor contraction pattern was detected ; CHECK: The matrix multiplication pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll @@ -1,5 +1,6 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ -; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s +; RUN: -debug-only=polly-opt-isl -disable-output \ +; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; ; for (i = 0; i < _PB_NI; i++) @@ -14,6 +15,7 @@ ; } ; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 1024; i++) +; for (j = 0; j < 1024; j++) +; for (l = 0; l < 64; ++l) +; for (w = 0; w < 64; ++w) +; C[i][j] += A[i][l][w] * B[w][j][l]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc30, %entry + %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader + %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader + %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv + %i = load double, double* %arrayidx13, align 8 + %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37 + %i1 = load double, double* %arrayidx19, align 8 + %mul = fmul fast double %i1, %i + %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40 + %i2 = load double, double* %arrayidx23, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx23, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 64 + br i1 %exitcond, label %for.body9, label %for.inc24 + +for.inc24: ; 
preds = %for.body9 + %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1 + %exitcond39 = icmp ne i64 %indvars.iv.next38, 64 + br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1 + %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024 + br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024 + br i1 %exitcond45, label %for.cond1.preheader, label %for.end32 + +for.end32: ; preds = %for.inc30 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 1024; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 1024; ++l) +; C[i][j][k] += A[i][k][l] * B[l][j]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc30, %entry + %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader + %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader + %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv + %i = load double, double* %arrayidx13, align 8 + %arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40 + %i1 = load double, double* %arrayidx17, align 8 + %mul = fmul fast double %i1, %i + %arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37 + %i2 = load double, double* %arrayidx23, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx23, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.body9, label %for.inc24 + +for.inc24: ; preds = %for.body9 + %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1 + %exitcond39 = icmp ne i64 %indvars.iv.next38, 32 + br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1 + %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024 + br i1 %exitcond42, label %for.cond4.preheader, label 
%for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next44, 32 + br i1 %exitcond45, label %for.cond1.preheader, label %for.end32 + +for.end32: ; preds = %for.inc30 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll @@ -0,0 +1,84 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 32; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 32; ++l) +; for (w = 0; w < 32; ++w) +; for (q = 0; q < 32; ++q) +; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ] + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv + %i = load double, double* %arrayidx21, align 8 + %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65 + %i1 = load double, double* %arrayidx29, align 8 + %mul = fmul fast double %i1, %i + %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59 + %i2 = load double, double* %arrayidx37, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.body15, label %for.inc38 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1 + %exitcond61 = icmp ne i64 %indvars.iv.next60, 32 + br i1 %exitcond61, label 
%for.cond13.preheader, label %for.inc41 + +for.inc41: ; preds = %for.inc38 + %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1 + %exitcond64 = icmp ne i64 %indvars.iv.next63, 32 + br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44 + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1 + %exitcond67 = icmp ne i64 %indvars.iv.next66, 32 + br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47 + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1 + %exitcond70 = icmp ne i64 %indvars.iv.next69, 32 + br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50 + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %exitcond73 = icmp ne i64 %indvars.iv.next72, 32 + br i1 %exitcond73, label %for.cond1.preheader, label %for.end52 + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll @@ -0,0 +1,84 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 8; i++) +; for (j = 0; j < 8; j++) +; for (k = 0; k < 4; ++k) +; for (l = 0; l < 1024; ++l) +; for (w = 0; w < 1024; ++w) +; for (q = 0; q < 4; ++q) +; C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ] + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71 + %i = load double, double* %arrayidx23, align 8 + %arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59 + %i1 = load double, double* %arrayidx27, align 8 + %mul = fmul fast double %i1, %i + %arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* 
%C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv + %i2 = load double, double* %arrayidx37, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 4 + br i1 %exitcond, label %for.body15, label %for.inc38 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1 + %exitcond61 = icmp ne i64 %indvars.iv.next60, 1024 + br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41 + +for.inc41: ; preds = %for.inc38 + %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1 + %exitcond64 = icmp ne i64 %indvars.iv.next63, 1024 + br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44 + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1 + %exitcond67 = icmp ne i64 %indvars.iv.next66, 4 + br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47 + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1 + %exitcond70 = icmp ne i64 %indvars.iv.next69, 8 + br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50 + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %exitcond73 = icmp ne i64 %indvars.iv.next72, 8 + br i1 %exitcond73, label %for.cond1.preheader, label %for.end52 + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; ; /* C := alpha*A*B + beta*C */ @@ -15,6 +16,7 @@ ; after the interchanging of loops. 
; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll @@ -0,0 +1,94 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 16; i++) +; for (j = 0; j < 16; j++) +; for (k = 0; k < 8; ++k) +; for (l = 0; l < 1024; ++l) +; for (w = 0; w < 8; ++w) +; for (q = 0; q < 8; ++q) +; for (x = 0; x < 8; ++x) +; C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc60, %entry + %indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc57, %for.cond1.preheader + %indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc54, %for.cond4.preheader + %indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc51, %for.cond7.preheader + %indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc48, %for.cond10.preheader + %indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ] + br label %for.cond16.preheader + +for.cond16.preheader: ; preds = %for.inc45, %for.cond13.preheader + %indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ] + br label %for.body18 + +for.body18: ; preds = %for.body18, %for.cond16.preheader + %indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ] + %arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79 + %i = load double, double* %arrayidx24, align 8 + %arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85 + %i1 = load double, double* %arrayidx32, align 8 + %mul = fmul fast double %i1, %i + %arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv + %i2 = load double, double* %arrayidx44, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx44, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.body18, label %for.inc45 + +for.inc45: ; preds = 
%for.body18 + %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1 + %exitcond72 = icmp ne i64 %indvars.iv.next71, 8 + br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48 + +for.inc48: ; preds = %for.inc45 + %indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1 + %exitcond75 = icmp ne i64 %indvars.iv.next74, 8 + br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51 + +for.inc51: ; preds = %for.inc48 + %indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1 + %exitcond78 = icmp ne i64 %indvars.iv.next77, 1024 + br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54 + +for.inc54: ; preds = %for.inc51 + %indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1 + %exitcond81 = icmp ne i64 %indvars.iv.next80, 8 + br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57 + +for.inc57: ; preds = %for.inc54 + %indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1 + %exitcond84 = icmp ne i64 %indvars.iv.next83, 16 + br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60 + +for.inc60: ; preds = %for.inc57 + %indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1 + %exitcond87 = icmp ne i64 %indvars.iv.next86, 16 + br i1 %exitcond87, label %for.cond1.preheader, label %for.end62 + +for.end62: ; preds = %for.inc60 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (int i = 0; i < 32; i++) +; for (int j = 0; j < 32; j++) +; for (int l = 0; l < 32; l++) +; for (int w = 0; w < 32; w++) +; C[i][j] += A[i][l][w] * B[w][j][i]; +; +; CHECK-NOT: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc33, %entry + %indvars.iv49 = phi i64 [ 0, %entry ], [ %indvars.iv.next50, %for.inc33 ] + br label %for.cond5.preheader + +for.cond5.preheader: ; preds = %for.inc30, %for.cond1.preheader + %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next46, %for.inc30 ] + br label %for.cond9.preheader + +for.cond9.preheader: ; preds = %for.inc27, %for.cond5.preheader + %indvars.iv41 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next42, %for.inc27 ] + br label %for.body12 + +for.body12: ; preds = %for.body12, %for.cond9.preheader + %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ] + %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv49, i64 %indvars.iv41, i64 %indvars.iv + %i = load double, double* %arrayidx16, align 8 + %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv45, i64 %indvars.iv49 + %i1 = load double, double* %arrayidx22, align 8 + %mul = fmul fast double %i1, %i + %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv49, i64 %indvars.iv45 + %i2 = load double, double* %arrayidx26, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx26, align 8 
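+; Note (explanatory comment, reasoning inferred from the accesses above): %arrayidx16
+; reads A[i][l][w] and %arrayidx22 reads B[w][j][i], so the free index i of C is used
+; by both operand tensors. In a tensor contraction each free index of the result may
+; appear in only one of the two operands, which is presumably why the pattern is
+; expected not to be detected here (see the CHECK-NOT line above).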
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.body12, label %for.inc27 + +for.inc27: ; preds = %for.body12 + %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1 + %exitcond44 = icmp ne i64 %indvars.iv.next42, 32 + br i1 %exitcond44, label %for.cond9.preheader, label %for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 + %exitcond48 = icmp ne i64 %indvars.iv.next46, 32 + br i1 %exitcond48, label %for.cond5.preheader, label %for.inc33 + +for.inc33: ; preds = %for.inc30 + %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1 + %exitcond52 = icmp ne i64 %indvars.iv.next50, 32 + br i1 %exitcond52, label %for.cond1.preheader, label %for.end35 + +for.end35: ; preds = %for.inc33 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (int i = 0; i < 32; i++) +; for (int j = 0; j < 32; j++) +; for (int l = 0; l < 32; l++) +; for (int w = 0; w < 32; w++) +; C[i][j] += A[i][l][w] * B[w][j][i+3]; +; +; CHECK-NOT: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc34, %entry + %indvars.iv50 = phi i64 [ 0, %entry ], [ %indvars.iv.next51, %for.inc34 ] + br label %for.cond5.preheader + +for.cond5.preheader: ; preds = %for.inc31, %for.cond1.preheader + %indvars.iv46 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next47, %for.inc31 ] + br label %for.cond9.preheader + +for.cond9.preheader: ; preds = %for.inc28, %for.cond5.preheader + %indvars.iv42 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next43, %for.inc28 ] + br label %for.body12 + +for.body12: ; preds = %for.body12, %for.cond9.preheader + %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ] + %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv50, i64 %indvars.iv42, i64 %indvars.iv + %i = load double, double* %arrayidx16, align 8 + %i1 = add nuw nsw i64 %indvars.iv50, 3 + %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv46, i64 %i1 + %i2 = load double, double* %arrayidx22, align 8 + %mul = fmul fast double %i2, %i + %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv50, i64 %indvars.iv46 + %i3 = load double, double* %arrayidx26, align 8 + %add27 = fadd fast double %i3, %mul + store double %add27, double* %arrayidx26, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.body12, label %for.inc28 + +for.inc28: ; preds = %for.body12 + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next43, 32 + br i1 %exitcond45, label %for.cond9.preheader, label %for.inc31 + +for.inc31: ; preds = %for.inc28 + 
%indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1 + %exitcond49 = icmp ne i64 %indvars.iv.next47, 32 + br i1 %exitcond49, label %for.cond5.preheader, label %for.inc34 + +for.inc34: ; preds = %for.inc31 + %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1 + %exitcond54 = icmp ne i64 %indvars.iv.next51, 32 + br i1 %exitcond54, label %for.cond1.preheader, label %for.end36 + +for.end36: ; preds = %for.inc34 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll @@ -0,0 +1,79 @@ +; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \ +; RUN: -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; Check that a region statement, which has the correct order of accesses, is not +; detected. +; +; for (int i = 0; i < 32; i++) +; for (int j = 0; j < 32; j++) +; for (int k = 0; k < 32; k++) { +; int c = C[i][j]; +; if (i*j*k < 10) { +; C[i][j] = A[i][k] + B[k][j]; +; } else { +; C[i][j] = c; +; } +; } +; +; CHECK-NOT: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x double]* noundef %B) #0 { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc34 + %indvars.iv48 = phi i64 [ 0, %entry ], [ %indvars.iv.next49, %for.inc34 ] + br label %for.cond5.preheader + +for.cond5.preheader: ; preds = %for.cond1.preheader, %for.inc31 + %indvars.iv43 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next44, %for.inc31 ] + br label %for.body8 + +for.body8: ; preds = %for.cond5.preheader, %if.end + %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %if.end ] + %arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv48, i64 %indvars.iv43 + %0 = mul nuw nsw i64 %indvars.iv43, %indvars.iv48 + %1 = mul nuw nsw i64 %0, %indvars.iv + %cmp12 = icmp ult i64 %1, 10 + br i1 %cmp12, label %if.then, label %if.else + +if.then: ; preds = %for.body8 + %arrayidx17 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv48, i64 %indvars.iv + %2 = load double, double* %arrayidx17, align 8 + %arrayidx21 = getelementptr inbounds [64 x double], [64 x double]* %B, i64 %indvars.iv, i64 %indvars.iv43 + %3 = load double, double* %arrayidx21, align 8 + %add = fadd fast double %3, %2 + br label %if.end + +if.else: ; preds = %for.body8 + %4 = load double, double* %arrayidx10, align 8 + %conv = fptosi double %4 to i32 + %conv26 = sitofp i32 %conv to double + br label %if.end + +if.end: ; preds = %if.else, %if.then + %storemerge = phi double [ %conv26, %if.else ], [ %add, %if.then ] + store double %storemerge, double* %arrayidx10, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.body8, label %for.inc31 + +for.inc31: ; preds = %if.end + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond47 = icmp ne i64 %indvars.iv.next44, 32 + br i1 %exitcond47, label %for.cond5.preheader, label %for.inc34 + +for.inc34: ; preds = %for.inc31 + %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1 + %exitcond51 = icmp ne 
i64 %indvars.iv.next49, 32 + br i1 %exitcond51, label %for.cond1.preheader, label %for.end36 + +for.end36: ; preds = %for.inc34 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \ +; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \ +; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 1024; i++) +; for (j = 0; j < 1024; j++) +; for (l = 0; l < 64; ++l) +; for (w = 0; w < 64; ++w) +; C[i][j] += A[i][l][w] * B[w][j][l]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc30, %entry + %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader + %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader + %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv + %i = load double, double* %arrayidx13, align 8 + %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37 + %i1 = load double, double* %arrayidx19, align 8 + %mul = fmul fast double %i1, %i + %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40 + %i2 = load double, double* %arrayidx23, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx23, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 64 + br i1 %exitcond, label %for.body9, label %for.inc24 + +for.inc24: ; preds = %for.body9 + %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1 + %exitcond39 = icmp ne i64 %indvars.iv.next38, 64 + br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1 + %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024 + br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024 + br i1 %exitcond45, label %for.cond1.preheader, label %for.end32 + +for.end32: ; preds = %for.inc30 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll +++ 
polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll @@ -1,12 +1,13 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ -; RUN: -debug < %s 2>&1 | FileCheck %s -; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \ +; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ ; RUN: -polly-target-throughput-vector-fma=1 \ ; RUN: -polly-target-latency-vector-fma=8 \ ; RUN: -polly-target-1st-cache-level-size=32768 \ ; RUN: -polly-target-vector-register-bitwidth=256 \ -; RUN: -polly-target-2nd-cache-level-size=262144 \ -; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS +; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \ +; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s | \ +; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS ; REQUIRES: asserts ; ; C := A * B + C @@ -20,6 +21,7 @@ ; for (j = 0; j < _PB_NJ; j++) ; C[i][j] += A[i][k] * B[k][j]; ; +; CHECK: The tensor contraction pattern was detected ; CHECK: The matrix multiplication pattern was detected ; ; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles